Source code for pytadbit.parsers.sam_parser

"""
17 nov. 2014
"""
from __future__ import print_function
from builtins import next

from itertools import combinations
from bisect import bisect_right as bisect
from pysam import Samfile
from pytadbit.mapping.restriction_enzymes import map_re_sites
from shutil import copyfileobj
from warnings import warn
import os
from sys import stdout

try:
    basestring
except NameError:
    basestring = str


[docs]def parse_sam(f_names1, f_names2=None, out_file1=None, out_file2=None,
              genome_seq=None, re_name=None, verbose=False, clean=True,
              mapper=None, **kwargs):
    """
    Parse sam/bam file using pysam tools.

    Keep a summary of the results into 2 tab-separated files that will contain 6
       columns: read ID, Chromosome, position, strand (either 0 or 1), mapped
       sequence lebgth, position of the closest upstream RE site, position of
       the closest downstream RE site

    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read1, can also  be just one file
    :param f_names1: a list of path to sam/bam files corresponding to the
       mapping of read2, can also  be just one file
    :param out_file1: path to outfile tab separated format containing mapped
       read1 information
    :param out_file1: path to outfile tab separated format containing mapped
       read2 information
    :param genome_seq: a dictionary generated by :func:`pyatdbit.parser.genome_parser.parse_fasta`.
       containing the genomic sequence
    :param re_name: name of the restriction enzyme used
    :param None mapper: software used to map (supported are GEM and BOWTIE2).
       Guessed from file by default.
    """
    # not nice, dirty fix in order to allow this function to only parse
    # one SAM file
    if not out_file1:
        raise Exception('ERROR: out_file1 should be given\n')
    if not re_name:
        raise Exception('ERROR: re_name should be given\n')
    if not genome_seq:
        raise Exception('ERROR: genome_seq should be given\n')
    if (f_names2 and not out_file2) or (not f_names2 and out_file2):
        raise Exception('ERROR: out_file2 AND f_names2 needed\n')

    frag_chunk = kwargs.get('frag_chunk', 100000)
    if verbose:
        print('Searching and mapping RE sites to the reference genome')
    frags = map_re_sites(re_name, genome_seq, frag_chunk=frag_chunk,
                         verbose=verbose)

    if isinstance(f_names1, basestring):
        f_names1 = [f_names1]
    if isinstance(f_names2, basestring):
        f_names2 = [f_names2]
    if f_names2:
        fnames = f_names1, f_names2
        outfiles = out_file1, out_file2
    else:
        fnames = (f_names1,)
        outfiles = (out_file1, )

    # max number of reads per intermediate files for sorting
    max_size = 1000000

    windows = {}
    multis  = {}
    procs   = []
    for read in range(len(fnames)):
        if verbose:
            print('Loading read' + str(read + 1))
        windows[read] = {}
        num = 0
        # iteration over reads
        nfile = 0
        tmp_files = []
        reads     = []
        for fnam in fnames[read]:
            try:
                fhandler = Samfile(fnam)
            except IOError:
                print('WARNING: file "%s" not found' % fnam)
                continue
            except ValueError:
                raise Exception('ERROR: not a SAM/BAM file\n%s' % fnam)
            # get the iteration number of the iterative mapping
            try:
                num = int(fnam.split('.')[-1].split(':')[0])
            except:
                num += 1
            # set read counter
            windows[read].setdefault(num, 0)
            # guess mapper used
            if not mapper:
                mapper = fhandler.header['PG'][0]['ID']
            if mapper.lower()=='gem':
                condition = lambda x: x[1][0][0] != 'N'
            elif mapper.lower() in ['bowtie', 'bowtie2']:
                condition = lambda x: 'XS' == x[0][0]
            else:
                warn('WARNING: unrecognized mapper used to generate file\n')
                condition = lambda x: x[1][1] != 1
            if verbose:
                print('loading SAM file from %s: %s' % (mapper, fnam))
            # getrname chromosome names
            i = 0
            crm_dict = {}
            while True:
                try:
                    crm_dict[i] = fhandler.getrname(i)
                    i += 1
                except ValueError:
                    break
            # iteration over reads
            sub_count = 0  # to empty read buffer
            for r in fhandler:
                if r.is_unmapped:
                    continue
                if condition(r.tags):
                    continue
                positive = not r.is_reverse
                crm      = crm_dict[r.tid]
                len_seq  = len(r.seq)
                if positive:
                    pos = r.pos + 1
                else:
                    pos = r.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    continue
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n(also reference genome can be truncated)')
                    next_re    = frag_piece[idx]
                prev_re    = frag_piece[idx - 1 if idx else 0]
                name       = r.qname
                reads.append('%s\t%s\t%d\t%d\t%d\t%d\t%d\n' % (
                    name, crm, pos, positive, len_seq, prev_re, next_re))
                windows[read][num] += 1
                sub_count += 1
                if sub_count >= max_size:
                    sub_count = 0
                    nfile += 1
                    write_reads_to_file(reads, outfiles[read], tmp_files, nfile)
            nfile += 1
            write_reads_to_file(reads, outfiles[read], tmp_files, nfile)


        # we have now sorted temporary files
        # we do merge sort for eah pair
        if verbose:
            stdout.write('Merge sort')
            stdout.flush()
        while len(tmp_files) > 1:
            file1 = tmp_files.pop(0)
            try:
                file2 = tmp_files.pop(0)
            except IndexError:
                break
            if verbose:
                stdout.write('.')
            stdout.flush()
            nfile += 1
            tmp_files.append(merge_sort(file1, file2, outfiles[read], nfile))
        if verbose:
            stdout.write('\n')
        tmp_name = tmp_files[0]

        if verbose:
            print('Getting Multiple contacts')
        reads_fh = open(outfiles[read], 'w')
        ## Also pipe file header
        # chromosome sizes (in order)
        reads_fh.write('# Chromosome lengths (order matters):\n')
        for crm in genome_seq:
            reads_fh.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
        reads_fh.write('# Mapped\treads count by iteration\n')
        for size in windows[read]:
            reads_fh.write('# MAPPED %d %d\n' % (size, windows[read][size]))

        ## Multicontacts
        tmp_reads_fh = open(tmp_name)
        try:
            read_line = next(tmp_reads_fh)
        except StopIteration:
            raise StopIteration('ERROR!\n Nothing parsed, check input files and'
                                ' chromosome names (in genome.fasta and SAM/MAP'
                                ' files).')
        prev_head = read_line.split('\t', 1)[0]
        prev_head = prev_head.split('~' , 1)[0]
        prev_read = read_line
        multis[read] = {}
        multi = 0
        for read_line in tmp_reads_fh:
            head = read_line.split('\t', 1)[0]
            head = head.split('~' , 1)[0]
            if head == prev_head:
                prev_read =  prev_read.strip() + '|||' + read_line
                multi += 1
            else:
                reads_fh.write(prev_read)
                prev_read = read_line
                try:
                    multis[read][multi] += 1
                except KeyError:
                    multis[read][multi] = 1
                multi = 0
            prev_head = head
        reads_fh.write(prev_read)
        reads_fh.close()
        tmp_reads_fh.close()
        if clean:
            os.system('rm -rf ' + tmp_name)
    # wait for compression to finish
    for p in procs:
        p.communicate()
    return windows, multis

def parse_gem_3c(f_name, out_file, genome_lengths, frags, verbose=False,
                 tmp_format=False, **kwargs):
    """
    Parse gem 3c sam file using pysam tools.

    :param f_name: path to sam file corresponding to the mapping of reads
    :param out_file: path to outfile tab separated format containing paired read information
    :param genome_lengths: a dictionary generated containing the length of the genomic sequence
                           per chromosome
    :param False tmp_format: If True leave the file prepared to be merged with other map files.
    """

    frag_chunk = kwargs.get('frag_chunk', 100000)
    try:
        fhandler = Samfile(f_name)
    except IOError:
        raise Exception('ERROR: file "%s" not found' % f_name)

    # max number of reads in buffer
    max_size = 1000000

    # getrname chromosome names
    i = 0
    crm_dict = {}
    while True:
        try:
            crm_dict[i] = fhandler.getrname(i)
            i += 1
        except ValueError:
            break
    # iteration over reads
    sub_count = 0
    nfile = 0
    tmp_files = []
    reads = []
    cur_name = ''
    write_pairs = False
    read1 = None
    read2 = []
    samiter = fhandler.fetch(until_eof=True)
    r = None
    try:
        r = next(samiter)
    except StopIteration:
        # empty SAM file
        return None
        pass
    while r:
        if not r.is_paired or r.is_unmapped or r.mapq < 4:
            try:
                r = next(samiter)
            except StopIteration:
                break
            continue

        if r.is_read1 and cur_name != r.qname:
            if read1 is None:
                read1 = r
                cur_name = r.qname
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
            else:
                write_pairs = True

        if not write_pairs:
            if r.is_read2 or r.is_supplementary:
                read2.append(r)
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
        else:
            if not read2:
                write_pairs = False
                read1 = None
                try:
                    r = next(samiter)
                except StopIteration:
                    break
                continue
            reads_grp = []
            read_id = read1.query_name
            for read in [read1]+read2:
                if read.query_name != read_id:
                    continue
                positive = not read.is_reverse
                crm      = crm_dict[read.tid]
                len_seq  = read.reference_end-read.pos
                if positive:
                    pos = read.pos + 1
                else:
                    pos = read.pos + len_seq
                try:
                    frag_piece = frags[crm][pos // frag_chunk]
                except KeyError:
                    # Chromosome not in hash
                    read_multi = []
                    break
                idx = bisect(frag_piece, pos)
                try:
                    next_re = frag_piece[idx]
                except IndexError:
                    # case where part of the read is mapped outside chromosome
                    count = 0
                    while idx >= len(frag_piece) and count < len_seq:
                        pos -= 1
                        count += 1
                        frag_piece = frags[crm][pos // frag_chunk]
                        idx = bisect(frag_piece, pos)
                    if count >= len_seq:
                        raise Exception('Read mapped mostly outside ' +
                                        'chromosome\n')
                    next_re    = frag_piece[idx]
                prev_re    = frag_piece[idx - 1 if idx else 0]
                reads_grp.append([read.tid, crm, pos, positive,
                                  len_seq, prev_re, next_re])
            if len(reads_grp) > 2:
                _merge_multis(reads_grp)
            elif len(reads_grp) < 2:
                reads_grp = []
            reads_multi = []
            for paired_reads in combinations(reads_grp, 2):
                read_multi = [item for sublist in sorted(paired_reads,key = lambda x: (x[0], x[2]))
                              for item in sublist]
                if read_multi:
                    reads_multi.append(read_multi)
                sub_count += 1

            paired_total = len(reads_multi)
            paired_nbr = 0
            for pair_read in reads_multi:
                read_name_id = read_id
                paired_nbr += 1
                if paired_total > 1:
                    read_name_id += '#%d/%d' % (paired_nbr,paired_total)
                reads.append([read_name_id]+pair_read)

            if sub_count >= max_size:
                sub_count = 0
                nfile += 1
                reads = sorted(reads, key = lambda x: (x[1], x[3], x[8], x[10]))
                read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n'
                              % tuple(read) for read in reads]
                write_paired_reads_to_file(read_lines, out_file, tmp_files, nfile)
                #map_out.write('\n'.join(reads)+'\n')
                del reads[:]
            write_pairs = False
            read1 = None
            del read2[:]
    if reads:
        nfile += 1
        reads = sorted(reads, key = lambda x: (x[1], x[3], x[8], x[10]))
        read_lines = ['%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n'
                      % tuple(read) for read in reads]
        write_paired_reads_to_file(read_lines, out_file, tmp_files, nfile)
        #map_out.write('\n'.join(reads))

    #map_out.close()
    # we have now sorted temporary files
    # we do merge sort for eah pair
    if verbose:
        stdout.write('Merge sort')
        stdout.flush()
    while len(tmp_files) > 1:
        file1 = tmp_files.pop(0)
        try:
            file2 = tmp_files.pop(0)
        except IndexError:
            break
        if verbose:
            stdout.write('.')
        stdout.flush()
        nfile += 1
        tmp_files.append(merge_sort(file1, file2, out_file, nfile, paired=True))
    if verbose:
        stdout.write('\n')

    if tmp_format:
        os.rename(tmp_files[0], out_file)
    else:
        map_out   = open(out_file, 'w')
        tmp_reads_fh = open(tmp_files[0],'rb')
        for crm in genome_lengths:
            map_out.write('# CRM %s\t%d\n' % (crm, genome_lengths[crm]))
        for read_line in tmp_reads_fh:
            read = read_line.split('\t')
            map_out.write('\t'.join([read[0]]+read[2:8]+read[9:]))
        map_out.close()
        os.system('rm -rf ' + tmp_files[0])

    return out_file

def write_reads_to_file(reads, outfiles, tmp_files, nfile):
    if not reads: # can be...
        return
    tmp_name = os.path.join(*outfiles.split('/')[:-1] +
                            [('tmp_%03d_' % nfile) + outfiles.split('/')[-1]])
    tmp_name = ('/' * outfiles.startswith('/')) + tmp_name
    tmp_files.append(tmp_name)
    out = open(tmp_name, 'w')
    out.write(''.join(sorted(reads, key=lambda x: x.split('\t', 1)[0].split('~')[0])))
    out.close()
    del(reads[:]) # empty list

def write_paired_reads_to_file(reads, outfiles, tmp_files, nfile):
    if not reads: # can be...
        return
    tmp_name = os.path.join(*outfiles.split('/')[:-1] +
                            [('tmp_%03d_' % nfile) + outfiles.split('/')[-1]])
    tmp_name = ('/' * outfiles.startswith('/')) + tmp_name
    tmp_files.append(tmp_name)
    out = open(tmp_name, 'w')
    out.write(''.join(reads))
    out.close()
    del(reads[:]) # empty list

def _merge_multis(reads_multi):

    merged_reads = []
    elts = {}
    for read in reads_multi:
        elts.setdefault((read[1], read[5], read[6]), []).append(read)
    # write contacts by pairs
    # loop over RE fragments
    for elt in elts:
        # case we have 1 read-frags inside current fragment
        if len(elts[elt]) == 1:
            merged_reads.append(elts[elt][0])
        # case all fragments felt into a single RE frag
        # we take only first and last
        elif len(elts) == 1:
            elts[elt] = sorted(
                elts[elt], key=lambda x: int(x[2]))[::len(elts[elt])-1]
            merged_reads.append(elts[elt][0])
            merged_reads.append(elts[elt][1])
        # case we have several read-frag in this RE fragment
        else:
            # take first and last
            map1, map2 = sorted(
                elts[elt], key=lambda x: int(x[2]))[::len(elts[elt])-1]
            strand = map1[3]
            # if the 2 strands are different keep the longest fragment
            if strand != map2[3]:
                map1 = max(elts[elt], key=lambda x: int(x[4]))
                merged_reads.append(map1)
                continue
            # sum up read-frags in the RE fragment  by putting
            # them on the same strand
            # use the strand of the first fragment as reference
            if strand == 1:
                beg = int(map1[2])
                nts = int(map2[2]) + int(map2[4]) - beg
            else:
                beg = int(map2[2])
                nts = beg - (int(map1[2]) - (int(map1[4])))
            merged_reads.append(list(map1[:2]) + [str(beg), strand, str(nts)] + list(map1[5:]))

    reads_multi = merged_reads


def merge_sort(file1, file2, outfiles, nfile, paired=False):
    tmp_name = os.path.join(*outfiles.split('/')[:-1] +
                            [('tmp_merged_%03d_' % nfile) + outfiles.split('/')[-1]])
    tmp_name = ('/' * outfiles.startswith('/')) + tmp_name
    tmp_file = open(tmp_name, 'w')
    fh1 = open(file1)
    fh2 = open(file2)
    if paired:
        greater = lambda x, y: [y,x] == sorted([y,x], key=lambda j: (int(j.split('\t')[1]),
                                                                 float(j.split('\t')[3]),
                                                                 int(j.split('\t')[8]),
                                                                 float(j.split('\t')[10])))
    else:
        greater = lambda x, y: x.split('\t', 1)[0].split('~')[0] > y.split('\t', 1)[0].split('~')[0]
    read1 = next(fh1)
    read2 = next(fh2)
    while True:
        if greater(read2, read1):
            tmp_file.write(read1)
            try:
                read1 = next(fh1)
            except StopIteration:
                tmp_file.write(read2)
                break
        else:
            tmp_file.write(read2)
            try:
                read2 = next(fh2)
            except StopIteration:
                tmp_file.write(read1)
                break
    for read in fh1:
        tmp_file.write(read)
    for read in fh2:
        tmp_file.write(read)
    fh1.close()
    fh2.close()
    tmp_file.close()
    os.system('rm -f ' + file1)
    os.system('rm -f ' + file2)
    return tmp_name