Source code for pytadbit.mapping.filter

"""
17 nov. 2014


"""
from __future__ import print_function
from builtins   import next
import multiprocessing as mu

from pytadbit.mapping.restriction_enzymes import count_re_fragments


# Catalogue of the read-pair filters: filter number -> name and running count
# of reads flagged by that filter (counts start at zero and are updated by
# filter_reads).
MASKED = {
    number: {'name': name, 'reads': 0}
    for number, name in enumerate(
        ('self-circle',
         'dangling-end',
         'error',
         'extra dangling-end',
         'too close from RES',
         'too short',
         'too large',
         'over-represented',
         'duplicated',
         'random breaks',
         'trans-chromosomic'), 1)}


[docs]def apply_filter(fnam, outfile, masked, filters=None, reverse=False, verbose=True): """ Create a new file with reads filtered :param fnam: input file path, where non-filtered read are stored :param outfile: output file path, where filtered read will be stored :param masked: dictionary given by the :func:`pytadbit.mapping.filter.filter_reads` :param None filters: list of numbers corresponding to the filters we want to apply (numbers correspond to the keys in the masked dictionary) :param False reverse: if set, the resulting outfile will only contain the reads filtered, not the valid pairs. :param False verbose: :returns: number of reads kept """ filters = filters or list(masked.keys()) filter_handlers = {} for k in filters: try: fh = open(masked[k]['fnam']) val = next(fh).strip() filter_handlers[k] = [val, fh] except StopIteration: pass out = open(outfile, 'w') fhandler = open(fnam) # get the header pos = 0 while True: line = next(fhandler) if not line.startswith('#'): break pos += len(line) out.write(line) fhandler.seek(pos) current = set([v for v, _ in list(filter_handlers.values())]) count = 0 if reverse: for line in fhandler: read = line.split('\t', 1)[0] if read in current: count += 1 out.write(line) else: continue # iterate over different filters to update current filters for k in list(filter_handlers.keys()): if read != filter_handlers[k][0]: continue try: # get next line from filter file filter_handlers[k][0] = next(filter_handlers[k][1]).strip() except StopIteration: filter_handlers[k][1].close() del filter_handlers[k] current = set([v for v, _ in list(filter_handlers.values())]) else: for line in fhandler: read = line.split('\t', 1)[0] if read not in current: count += 1 out.write(line) continue # iterate over different filters to update current filters for k in list(filter_handlers.keys()): if read != filter_handlers[k][0]: continue try: # get next line from filter file filter_handlers[k][0] = next(filter_handlers[k][1]).strip() except StopIteration: 
filter_handlers[k][1].close() del filter_handlers[k] current = set([v for v, _ in list(filter_handlers.values())]) if verbose: print(' saving to file {:,} {} reads.'.format( count, 'filtered' if reverse else 'valid')) out.close() fhandler.close() return count
def filter_reads(fnam, output=None, max_molecule_length=500,
                 over_represented=0.005, max_frag_size=100000,
                 min_frag_size=100, re_proximity=5, verbose=True,
                 savedata=None, min_dist_to_re=750, strict_duplicates=False,
                 fast=True):
    """
    Filter mapped pair of reads in order to remove experimental artifacts
    (e.g. dangling-ends, self-circle, PCR artifacts...)

    Applied filters are:

       1- self-circle        : reads are coming from a single RE fragment and
          point to the outside (----<===---===>---)
       2- dangling-end       : reads are coming from a single RE fragment and
          point to the inside (----===>---<===---)
       3- error              : reads are coming from a single RE fragment and
          point in the same direction
       4- extra dangling-end : reads are coming from different RE fragment but
          are close enough (< max_molecule length) and point to the inside
       5- too close from RES : semi-dangling-end filter, start position of one
          of the read is too close (5 bp by default) from RE cutting site.
          This filter is skipped in case read is involved in multi-contact.
          This filter may be too conservative for 4bp cutter REs.
       6- too short          : remove reads coming from small restriction less
          than 100 bp (default) because they are comparable to the read length
       7- too large          : remove reads coming from large restriction
          fragments (default: 100 Kb, P < 10-5 to occur in a randomized
          genome) as they likely represent poorly assembled or repeated
          regions
       8- over-represented   : reads coming from the top 0.5% most frequently
          detected restriction fragments, they may be prone to PCR artifacts
          or represent fragile regions of the genome or genome assembly errors
       9- duplicated         : the combination of the start positions of the
          reads is repeated -> PCR artifact (only keep one copy)
       10- random breaks     : start position of one of the read is too far
          (more than min_dist_to_re) from RE cutting site. Non-canonical
          enzyme activity or random physical breakage of the chromatin.

    :param fnam: path to file containing the pair of reads in tsv format, file
       generated by :func:`pytadbit.mapping.mapper.get_intersection`
    :param None output: PATH where to write files containing IDs of filtered
       reads. Uses fnam by default.
    :param 500 max_molecule_length: facing reads that are within
       max_molecule_length, will be classified as 'extra dangling-ends'
    :param 0.005 over_represented: to remove the very top fragment containing
       more reads
    :param 100000 max_frag_size: maximum fragment size allowed (fragments
       should not span over several bins)
    :param 100 min_frag_size: remove fragment that are too short (shorter than
       the sequenced read length)
    :param 5 re_proximity: should be adjusted according to RE site, to filter
       semi-dangling-ends
    :param 750 min_dist_to_re: minimum distance the start of a read should be
       from a RE site (usually 1.5 times the insert size). Applied in
       filter 10
    :param None savedata: PATH where to write the number of reads retained by
       each filter
    :param True fast: parallel version, requires 4 CPUs and more RAM memory
    :param False strict_duplicates: by default reads are considered duplicates
       if they coincide in genomic coordinates and strand; with
       strict_duplicates enabled, we also ask to consider read length
       (WARNING: this option is called strict, but it is more permissive).

    :return: dictionary with, as keys, the kind of filter applied, and as
       values a set of read IDs to be removed

    *Note: Filtering is not exclusive, one read can be filtered several
    times.*
    """
    if not output:
        output = fnam
    # pick the duplicate definition (see strict_duplicates above)
    if strict_duplicates:
        _filter_duplicates = _filter_duplicates_strict
    else:
        _filter_duplicates = _filter_duplicates_loose
    if not fast:  # mainly for debugging
        if verbose:
            print('filtering duplicates')
        sub_mask, total = _filter_duplicates(fnam, output)
        MASKED.update(sub_mask)
        if verbose:
            print('filtering same fragments')
        MASKED.update(_filter_same_frag(fnam, max_molecule_length, output))
        if verbose:
            # fixed typo: previously printed 'filtering fro RE'
            print('filtering from RE')
        MASKED.update(_filter_from_res(fnam, max_frag_size, min_dist_to_re,
                                       re_proximity, min_frag_size, output))
        if verbose:
            print('filtering over represented')
        MASKED.update(_filter_over_represented(fnam, over_represented,
                                               output))
    else:
        # the four filter passes are independent: run them in parallel
        pool = mu.Pool(4)
        a = pool.apply_async(_filter_same_frag,
                             args=(fnam, max_molecule_length, output))
        b = pool.apply_async(_filter_from_res,
                             args=(fnam, max_frag_size, min_dist_to_re,
                                   re_proximity, min_frag_size, output))
        c = pool.apply_async(_filter_over_represented,
                             args=(fnam, over_represented, output))
        d = pool.apply_async(_filter_duplicates, args=(fnam, output))
        pool.close()
        pool.join()
        # only the duplicate filter also returns the total number of pairs
        sub_mask, total = d.get()
        MASKED.update(sub_mask)
        MASKED.update(b.get())
        MASKED.update(c.get())
        MASKED.update(a.get())
    if savedata:
        out = open(savedata, 'w')
        out.write('Mapped both\t%d\n' % total)
        for k in range(1, len(MASKED) + 1):
            out.write('%s\t%d\n' % (MASKED[k]['name'], MASKED[k]['reads']))
        out.close()
    if verbose:
        print('Filtered reads (and percentage of total):\n')
        print(' {:>25} : {:12,} (100.00%)'.format('Mapped both', total))
        print(' ' + '-' * 53)
        # NOTE(review): only filters 1..len(MASKED)-1 are listed here (the
        # last MASKED entry is not produced by the filters above)
        for k in range(1, len(MASKED)):
            print(' {:2}- {:>25} : {:12,} ({:6.2f}%)'.format(
                k, MASKED[k]['name'], MASKED[k]['reads'],
                # guard against empty input (total == 0 previously raised
                # ZeroDivisionError)
                float(MASKED[k]['reads']) / total * 100 if total else 0.))
    return MASKED
def _filter_same_frag(fnam, max_molecule_length, output):
    """
    Flag read pairs mapping to a single RE fragment (self-circle,
    dangling-end, error) or to two different but close, facing fragments
    (extra dangling-end).  Writes one file of read IDs per filter and returns
    the masked dictionary (keys 1-4) with counts and output file paths.
    """
    # t0 = time()
    masked = {1 : {'name': 'self-circle'       , 'reads': 0},
              2 : {'name': 'dangling-end'      , 'reads': 0},
              3 : {'name': 'error'             , 'reads': 0},
              4 : {'name': 'extra dangling-end', 'reads': 0}}
    # one output file of read IDs per filter
    outfil = {}
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k] = open(masked[k]['fnam'], 'w')
    fhandler = open(fnam)
    # skip header lines
    line = next(fhandler)
    while line.startswith('#'):
        line = next(fhandler)
    try:
        while True:
            # per-mate fields: chromosome, position, strand, <length>,
            # <fragment start>, fragment end (length/start unused here)
            (read,
             cr1, pos1, sd1, _, _, re1,
             cr2, pos2, sd2, _, _, re2) = line.split('\t')
            ps1, ps2, sd1, sd2 = list(map(int, (pos1, pos2, sd1, sd2)))
            if cr1 == cr2:
                # same chromosome; re2 still carries the trailing newline
                if re1 == re2.rstrip():
                    # both mates on the same RE fragment
                    if sd1 != sd2:
                        if (ps2 > ps1) == sd2:
                            # ----<===---===>--- self-circles
                            masked[1]["reads"] += 1
                            outfil[1].write(read + '\n')
                        else:
                            # ----===>---<===--- dangling-ends
                            masked[2]["reads"] += 1
                            outfil[2].write(read + '\n')
                    else:
                        # --===>--===>-- or --<===--<===-- or same errors
                        masked[3]["reads"] += 1
                        outfil[3].write(read + '\n')
                elif (abs(ps1 - ps2) < max_molecule_length
                      and sd2 != sd1 and (ps2 > ps1) != sd2):
                    # different fragments but facing and very close
                    masked[4]["reads"] += 1
                    outfil[4].write(read + '\n')
            line = next(fhandler)
    except StopIteration:
        # end of input file reached
        pass
    fhandler.close()
    # print 'done 1', time() - t0
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k].close()
    return masked


def _filter_duplicates_strict(fnam, output):
    """
    Flag PCR duplicates: consecutive pairs identical in chromosomes,
    positions, strands AND read lengths (duplicates are assumed adjacent in
    the sorted input).  Returns the masked dictionary (key 9) and the number
    of pairs counted.
    """
    total = 0
    masked = {9 : {'name': 'duplicated'        , 'reads': 0}}
    outfil = {}
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k] = open(masked[k]['fnam'], 'w')
    fhandler = open(fnam)
    # skip header lines
    line = next(fhandler)
    while line.startswith('#'):
        line = next(fhandler)
    # seed the comparison with the first data line
    # (l1/l2 are the mate read lengths -- the 'strict' extra criterion)
    (read,
     cr1, pos1, sd1, l1, _, _,
     cr2, pos2, sd2, l2, _, _) = line.split('\t')
    prev_elts = cr1, pos1, cr2, pos2, sd1, sd2, l1, l2
    for line in fhandler:
        (read,
         cr1, pos1, sd1, l1, _, _,
         cr2, pos2, sd2, l2, _, _) = line.split('\t')
        new_elts = cr1, pos1, cr2, pos2, sd1, sd2, l1, l2
        if prev_elts == new_elts:
            # identical to the previous pair -> duplicate
            masked[9]["reads"] += 1
            outfil[9].write(read + '\n')
        # NOTE(review): the first data line is not included in total -- the
        # count starts at the second pair; confirm this is intended
        total += 1
        prev_elts = new_elts
    fhandler.close()
    # print 'done 4', time() - t0
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k].close()
    return masked, total


def _filter_duplicates_loose(fnam, output):
    """
    Flag PCR duplicates: consecutive pairs identical in chromosomes,
    positions and strands (read lengths ignored, unlike the 'strict'
    variant).  Returns the masked dictionary (key 9) and the number of pairs
    counted.
    """
    total = 0
    masked = {9 : {'name': 'duplicated'        , 'reads': 0}}
    outfil = {}
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k] = open(masked[k]['fnam'], 'w')
    fhandler = open(fnam)
    # skip header lines
    line = next(fhandler)
    while line.startswith('#'):
        line = next(fhandler)
    # seed the comparison with the first data line
    (read,
     cr1, pos1, sd1, _, _, _,
     cr2, pos2, sd2, _, _, _) = line.split('\t')
    prev_elts = cr1, pos1, cr2, pos2, sd1, sd2
    for line in fhandler:
        (read,
         cr1, pos1, sd1, _, _, _,
         cr2, pos2, sd2, _, _, _) = line.split('\t')
        new_elts = cr1, pos1, cr2, pos2, sd1, sd2
        if prev_elts == new_elts:
            # identical to the previous pair -> duplicate
            masked[9]["reads"] += 1
            outfil[9].write(read + '\n')
        # NOTE(review): the first data line is not included in total -- the
        # count starts at the second pair; confirm this is intended
        total += 1
        prev_elts = new_elts
    fhandler.close()
    # print 'done 4', time() - t0
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k].close()
    return masked, total


def _filter_from_res(fnam, max_frag_size, min_dist_to_re, re_proximity,
                     min_frag_size, output):
    """
    Flag read pairs according to the RE fragments they map to: start too
    close to a RE site (5), fragment too short (6) or too large (7), start
    too far from any RE site, i.e. random break (10).  Writes one file of
    read IDs per filter and returns the masked dictionary (keys 5, 6, 7, 10).
    """
    # t0 = time()
    masked = {5 : {'name': 'too close from RES', 'reads': 0},
              6 : {'name': 'too short'         , 'reads': 0},
              7 : {'name': 'too large'         , 'reads': 0},
              10: {'name': 'random breaks'     , 'reads': 0}}
    outfil = {}
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k] = open(masked[k]['fnam'], 'w')
    fhandler = open(fnam)
    # skip header lines
    line = next(fhandler)
    while line.startswith('#'):
        line = next(fhandler)
    try:
        while True:
            # per-mate fields: <chromosome>, position, <strand>, <length>,
            # fragment start (rs), fragment end (re)
            (read,
             _, pos1, _, _, rs1, re1,
             _, pos2, _, _, rs2, re2) = line.split('\t')
            ps1, ps2, re1, rs1, re2, rs2 = list(map(
                int, (pos1, pos2, re1, rs1, re2, rs2)))
            # distances from each read start to its fragment's two RE sites
            diff11 = re1 - ps1
            diff12 = ps1 - rs1
            diff21 = re2 - ps2
            diff22 = ps2 - rs2
            if ((diff11 < re_proximity) or (diff12 < re_proximity) or
                (diff21 < re_proximity) or (diff22 < re_proximity)):
                # multicontacts excluded if fragment is internal (not the first)
                if not '~' in read:
                    masked[5]["reads"] += 1
                    outfil[5].write(read + '\n')
            # random breaks
            if (((diff11 > min_dist_to_re) and (diff12 > min_dist_to_re)) or
                ((diff21 > min_dist_to_re) and (diff22 > min_dist_to_re))):
                masked[10]["reads"] += 1
                outfil[10].write(read + '\n')
            # fragment sizes
            dif1 = re1 - rs1
            dif2 = re2 - rs2
            if (dif1 < min_frag_size) or (dif2 < min_frag_size):
                masked[6]["reads"] += 1
                outfil[6].write(read + '\n')
            if (dif1 > max_frag_size) or (dif2 > max_frag_size):
                masked[7]["reads"] += 1
                outfil[7].write(read + '\n')
            line = next(fhandler)
    except StopIteration:
        # end of input file reached
        pass
    fhandler.close()
    # print 'done 2', time() - t0
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k].close()
    return masked


def _filter_over_represented(fnam, over_represented, output):
    """
    Flag read pairs mapping to the `over_represented` top fraction of most
    frequently seen RE fragments.  Writes a file of read IDs and returns the
    masked dictionary (key 8).
    """
    # t0 = time()
    frag_count = count_re_fragments(fnam)
    num_frags = len(frag_count)
    # rank (rounded) delimiting the top `over_represented` fraction
    cut = int((1 - over_represented) * num_frags + 0.5)
    # use cut-1 because it represents the length of the list
    cut = sorted([frag_count[crm] for crm in frag_count])[cut - 1]
    masked = {8 : {'name': 'over-represented'  , 'reads': 0}}
    outfil = {}
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k] = open(masked[k]['fnam'], 'w')
    fhandler = open(fnam)
    # skip header lines
    line = next(fhandler)
    while line.startswith('#'):
        line = next(fhandler)
    try:
        while True:
            # only chromosome and fragment start are needed; rs kept as str
            # (frag_count presumably keyed by (chromosome, fragment start)
            # strings -- see count_re_fragments)
            read, cr1, _, _, _, rs1, _, cr2, _, _, _, rs2, _ = line.split('\t')
            if (frag_count.get((cr1, rs1), 0) > cut or
                frag_count.get((cr2, rs2), 0) > cut):
                masked[8]["reads"] += 1
                outfil[8].write(read + '\n')
            line = next(fhandler)
    except StopIteration:
        # end of input file reached
        pass
    fhandler.close()
    # print 'done 3', time() - t0
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k].close()
    return masked


def _filter_yannick(fnam, maxlen, de_left, de_right, output):
    """
    Alternative (exclusive) classification of read pairs into categories
    11-20 ('Y ...' filters): each pair gets exactly one category, defaulting
    to 'Y Other' (20).  Writes one file of read IDs per category and returns
    the masked dictionary.

    NOTE(review): de_left/de_right appear to be containers of allowed
    read-start offsets from the closest RE site defining dangling-ends --
    confirm against the caller.
    """
    # t0 = time()
    masked = {11: {'name': 'Y Dangling L', 'reads': 0},
              12: {'name': 'Y Dangling R', 'reads': 0},
              13: {'name': 'Y Rejoined', 'reads': 0},
              14: {'name': 'Y Self Circle', 'reads': 0},
              15: {'name': 'Y Random Break', 'reads': 0},
              16: {'name': 'Y Contact Close', 'reads': 0},
              17: {'name': 'Y Contact Far', 'reads': 0},
              18: {'name': 'Y Contact Upstream', 'reads': 0},
              19: {'name': 'Y Contact Downstream', 'reads': 0},
              20: {'name': 'Y Other', 'reads': 0}}
    outfil = {}
    for k in masked:
        masked[k]['fnam'] = output + '_' + masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k] = open(masked[k]['fnam'], 'w')
    fhandler = open(fnam)
    # skip header lines
    line = next(fhandler)
    while line.startswith('#'):
        line = next(fhandler)
    try:
        while True:
            (read,
             n1, pos1, strand1, _, rs1, re1,
             n2, pos2, strand2, _, rs2, re2) = line.split('\t')
            pos1, pos2, re1, rs1, re2, rs2, strand1, strand2 = list(map(
                int, (pos1, pos2, re1, rs1, re2, rs2, strand1, strand2)))
            # lexicographic order for chromosomes
            if n1 > n2 or (n1 == n2 and pos2 < pos1):
                pos1, pos2, n1, n2, re1, rs1, re2, rs2, strand1, strand2 = \
                    pos2, pos1, n2, n1, re2, rs2, re1, rs1, strand2, strand1
            # RE site closest to each read start
            closest1 = rs1 if (pos1 - rs1 < re1 - pos1) else re1
            closest2 = rs2 if (pos2 - rs2 < re2 - pos2) else re2
            cat = 20  # fall-through is "Other"
            # contacts
            if n1 != n2 or (closest1 != closest2 and re1 != re2):
                if strand1 == 1 and re1 - pos1 < maxlen and \
                   strand2 == 1 and re2 - pos2 < maxlen:
                    cat = 18
                elif strand1 == 1 and re1 - pos1 < maxlen and \
                     strand2 == 0 and pos2 - rs2 < maxlen and \
                     pos2 - pos1 >= maxlen:
                    cat = 17
                elif strand1 == 0 and pos1 - rs1 < maxlen and \
                     strand2 == 1 and re2 - pos2 < maxlen:
                    cat = 16
                elif strand1 == 0 and pos1 - rs1 < maxlen and \
                     strand2 == 0 and pos2 - rs2 < maxlen:
                    cat = 19
            # self circles
            if re1 == re2 and strand1 == 0 and strand2 == 1 and \
               pos1 - rs1 < maxlen and re2 - pos2 < maxlen:
                cat = 14
            # random, rejoined and dangling
            if n1 == n2 and pos2 - pos1 < maxlen and strand1 == 1 and \
               strand2 == 0:
                if re2 == re1:
                    cat = 15
                elif re1 <= rs2:
                    cat = 13
                if pos1 - closest1 in de_left:
                    cat = 11
                if pos2 - closest2 in de_right:
                    cat = 12
            # apply classification
            masked[cat]["reads"] += 1
            outfil[cat].write(read + '\n')
            line = next(fhandler)
    except StopIteration:
        # end of input file reached
        pass
    fhandler.close()
    for k in masked:
        masked[k]['fnam'] = output + '_' + \
            masked[k]['name'].replace(' ', '_') + '.tsv'
        outfil[k].close()
    return masked