Source code for pytadbit.mapping.restriction_enzymes

"""
14 nov. 2014

Definition and mapping of restriction enzymes
"""
from __future__ import print_function

from re import compile
from warnings import warn

from collections import OrderedDict
from scipy.stats import binom_test

from pytadbit.utils.file_handling import magic_open

# ``basestring`` is not defined on every Python version: when the name is
# missing, alias it to ``str`` so the isinstance checks below keep working.
try:
    basestring
except NameError:
    basestring = str

def iupac2regex(restring):
    """
    Expand the IUPAC ambiguity codes of a target site into regex character
    classes.

    :param restring: target site (or regex built around it) written with
       upper-case IUPAC codes

    :returns: the same string where each of R, Y, M, K, S, W, H, B, V, D and N
       is replaced by the bracketed set of nucleotides it stands for; any
       other character is left untouched
    """
    iupac_classes = {
        'R': '[AG]',
        'Y': '[CT]',
        'M': '[AC]',
        'K': '[GT]',
        'S': '[CG]',
        'W': '[AT]',
        'H': '[ACT]',
        'B': '[CGT]',
        'V': '[ACG]',
        'D': '[AGT]',
        'N': '[ATGC]',
    }
    # the expansions only contain A, C, G, T and brackets, none of which is an
    # ambiguity code, so a single pass is equivalent to successive replaces
    return ''.join(iupac_classes.get(symbol, symbol) for symbol in restring)


def count_re_fragments(fnam):
    """
    Count how many times each (cr, rs) pair appears in a tab-separated file of
    read pairs.

    Leading lines starting with '#' are skipped. Every following line must
    have exactly 13 tab-separated columns; columns 2 and 6 form the key of the
    first read-end and columns 8 and 12 the key of the second one (presumably
    chromosome name and restriction-site position -- confirm against the
    writer of the file).

    :param fnam: path to the input file

    :returns: a dictionary with (cr, rs) tuples of strings as keys and the
       number of read-ends carrying that pair as values. An empty or
       header-only file results in an empty dictionary.

    :raises ValueError: if a data line does not have exactly 13 columns
    """
    frag_count = {}
    # the context manager closes the file even if a line fails to parse
    with open(fnam) as fhandler:
        in_header = True
        for line in fhandler:
            # only the leading block of '#' lines is treated as header
            if in_header and line.startswith('#'):
                continue
            in_header = False
            _, cr1, _, _, _, rs1, _, cr2, _, _, _, rs2, _ = line.split('\t')
            for key in ((cr1, rs1), (cr2, rs2)):
                frag_count[key] = frag_count.get(key, 0) + 1
    return frag_count


def map_re_sites_nochunk(enzyme_name, genome_seq, verbose=False):
    """
    map all restriction enzyme (RE) sites of a given enzyme in a genome.
    Position of a RE site is defined as the genomic coordinate of the first
    nucleotide after the first cut (genomic coordinate starts at 1).


    In the case of HindIII the genomic coordinate is this one:

    123456 789...
           |
           v
    -----A|AGCT T--------------
    -----T TCGA|A--------------

    In this example the coordinate of the RE site would be 7.


    :param enzyme_name: name of the enzyme to map (upper/lower case are
       important), or a list (or any other iterable) of enzyme names
    :param genome_seq: a dictionary containing the genomic sequence by
       chromosome
    :param False verbose: print the total number of RE sites found

    :returns: a dictionary with, for each chromosome, the list of RE-site
       coordinates, starting with 1 and ending with the chromosome length
    """
    warn('WARNING: not reviewed since multiple-cut branch, and the use of '
         'regexp instead of index')
    if isinstance(enzyme_name, basestring):
        enzyme_names = [enzyme_name]
    else:
        # lists are used as given; other iterables (tuples, sets...) used to
        # fail further down with an unbound local variable
        enzyme_names = list(enzyme_name)
    enzymes = {}
    for name in enzyme_names:
        enzymes[name] = RESTRICTION_ENZYMES[name]

    # for each enzyme we build a zero-width pattern: a look-behind on the part
    # of the site placed before the cut, enclosing a look-ahead on the part
    # placed after it. The match is thus located exactly at the cut.
    restring = ('%s') % ('|'.join(['(?<=%s(?=%s))' % tuple(enzymes[n].split('|'))
                                   for n in enzymes]))
    # IUPAC conventions
    restring = iupac2regex(restring)

    enz_pattern = compile(restring)

    frags = {}
    count = 0
    for crm in genome_seq:
        seq = genome_seq[crm]
        frags[crm] = [1]
        for match in enz_pattern.finditer(seq):
            # match.end() is the 0-based index of the first nucleotide after
            # the cut, +1 converts it to a genomic coordinate starting at 1
            pos = match.end() + 1
            frags[crm].append(pos)
            count += 1
        # at the end of the list we add the chromosome length
        frags[crm].append(len(seq))
    if verbose:
        print('Found %d RE sites' % count)
    return frags


def map_re_sites(enzyme_name, genome_seq, frag_chunk=100000, verbose=False):
    """
    map all restriction enzyme (RE) sites of a given enzyme in a genome.
    Position of a RE site is defined as the genomic coordinate of the first
    nucleotide after the first cut (genomic coordinate starts at 1).


    In the case of HindIII the genomic coordinate is this one:

    123456 789...
           |
           v
    -----A|AGCT T--------------
    -----T TCGA|A--------------

    In this example the coordinate of the RE site would be 7.


    :param enzyme_name: name of the enzyme to map (upper/lower case are
       important), or a list (or any other iterable) of enzyme names
    :param genome_seq: a dictionary containing the genomic sequence by
       chromosome
    :param 100000 frag_chunk: in order to optimize the search for nearby RE
       sites, each chromosome is split into chunks.
    :param False verbose: print the total number of RE sites found

    :returns: a dictionary with, for each chromosome, a dictionary of chunks
       (keys are chunk indexes, i.e. coordinate // frag_chunk) each holding
       the list of RE-site coordinates of the chunk, flanked by one RE site
       taken from the neighboring chunks
    """
    if isinstance(enzyme_name, basestring):
        enzyme_names = [enzyme_name]
    else:
        # lists are used as given; other iterables (tuples, sets...) used to
        # fail further down with an unbound local variable
        enzyme_names = list(enzyme_name)
    enzymes = {}
    for name in enzyme_names:
        enzymes[name] = RESTRICTION_ENZYMES[name]

    # for each enzyme we build a zero-width pattern: a look-behind on the part
    # of the site placed before the cut, enclosing a look-ahead on the part
    # placed after it. The match is thus located exactly at the cut.
    restring = ('%s') % ('|'.join(['(?<=%s(?=%s))' % tuple(enzymes[n].split('|'))
                                   for n in enzymes]))
    # IUPAC conventions
    restring = iupac2regex(restring)

    enz_pattern = compile(restring)

    frags = {}
    count = 0
    for crm in genome_seq:
        seq = genome_seq[crm]
        frags[crm] = dict([(i, []) for i in range(int(len(seq) // frag_chunk + 1))])
        frags[crm][0] = [1]
        for match in enz_pattern.finditer(seq):
            # match.end() is the 0-based index of the first nucleotide after
            # the cut, +1 converts it to a genomic coordinate starting at 1
            pos = match.end() + 1
            frags[crm][pos // frag_chunk].append(pos)
            count += 1
        # at the end of last chunk we add the chromosome length
        frags[crm][len(seq) // frag_chunk].append(len(seq))
        # now we need to assign as first RE site of a fragment the last RE site
        # of previous fragment, and as last RE site, the first RE site of the
        # next fragment.
        for i in range(int(len(seq) // frag_chunk + 1)):
            try:
                try:
                    # [-1] of previous chunk is the site it borrowed from the
                    # current one, its own last site is thus at [-2]
                    frags[crm][i].insert(0, frags[crm][i - 1][-2])
                except IndexError:
                    # in case there was no RE site in previous fragment
                    frags[crm][i].insert(0, frags[crm][i - 1][-1])
            except KeyError:
                # it is the very first chunk
                pass
            plus = 1
            while True:
                try:
                    frags[crm][i].append(frags[crm][i + plus][0])
                    break
                except IndexError:
                    # no RE site in this fragment, get "next RE site" from next
                    plus += 1
                except KeyError:
                    # end of the chromosome
                    break
    if verbose:
        print('Found %d RE sites' % count)
    return frags
def complementary(seq):
    """
    Reverse-complement a nucleotide sequence.

    :param seq: sequence written with upper-case nucleotides (A, T, G, C) or
       IUPAC ambiguity codes (N, R, Y, M, K, S, W, H, D, B, V)

    :returns: the complement of each letter, read from the end of the input
       sequence to its start

    :raises KeyError: if the sequence contains any other character (lower-case
       letters included)
    """
    # each ambiguity code maps to the code of the complementary set of
    # nucleotides (e.g. R = A/G  ->  Y = T/C)
    trs = dict(zip('ATGCNRYMKSWHDBV',
                   'TACGNYRKMSWDHVB'))
    return ''.join([trs[s] for s in seq[::-1]])
def repaired(r_enz):
    """
    returns the resulting sequence after repair of a digested end, marking
    dangling ends.

    The sequence is built from the part of the site placed before the cut,
    followed by the nucleotides of the site located between the shortest and
    the longest of its two parts, and is then passed through
    :func:`complementary`. For HindIII (A|AGCTT) this is complementary('AAGCT')

    :param r_enz: name of the restriction enzyme

    :returns: the sequence described above
    """
    site = RESTRICTION_ENZYMES[r_enz]
    beg, end = site.split('|')
    site = site.replace('|', '')
    shortest = min(len(beg), len(end))
    longest = max(len(beg), len(end))
    return complementary(beg + site[shortest:longest])
def religateds(r_enzs):
    """
    returns the resulting list of all possible sequences after religation of
    two digested and repaired ends.

    :param r_enzs: list of restriction enzyme names

    :returns: an ordered dictionary with, for each ordered pair of enzyme
       names, the sequence expected at the ligation junction (e.g.
       AAGCTAGCTT for HindIII with itself)
    """
    ligations = OrderedDict()
    for r_enz1 in r_enzs:
        beg1, end1 = RESTRICTION_ENZYMES[r_enz1].split('|')
        for r_enz2 in r_enzs:
            _, end2 = RESTRICTION_ENZYMES[r_enz2].split('|')
            ligations[(r_enz1, r_enz2)] = (beg1 + end1[:len(end1) - len(beg1)]
                                           + end2)
    return ligations


def identify_re(fnam, nreads=100000):
    """
    Search most probable restriction enzyme used in the Hi-C experiment.
    Uses binomial test and some heuristics.

    :param fnam: path to FASTQ file
    :param 100000 nreads: number of reads to use for prediction, the more the
       better (and slower). If the file holds fewer reads, all of them are
       used.

    :returns: 1- most probable pattern, 2- the corresponding list of
       restriction enzyme names and 3- p-value of binomial test (which is not
       necessarily meaningful).

    :raises IndexError: if none of the known patterns is found at the start of
       any read
    """
    # group enzyme names by the part of their site placed after the cut
    # NOTE(review): patterns are compiled as they are, IUPAC ambiguity codes
    # are matched literally here -- confirm this is intended
    pats = {}
    for k in RESTRICTION_ENZYMES:
        pat = RESTRICTION_ENZYMES[k].split('|')[1]
        if len(pat) < 1:
            continue
        pats.setdefault(pat, {'name': [], 're': compile(pat), 'count': 0})
        pats[pat]['name'].append(k)

    fh = magic_open(fnam)
    nseen = 0  # number of complete FASTQ records actually read
    try:
        for _ in range(nreads):
            try:
                next(fh)
                s = next(fh)[:14]
                next(fh)
                next(fh)
            except StopIteration:
                # fewer than nreads records in the file
                break
            nseen += 1
            for pat in pats:
                if pats[pat]['re'].match(s):
                    pats[pat]['count'] += 1
    finally:
        # assumes magic_open creates a new handle when given a path -- TODO
        # confirm. Anything that is not a path is left for the caller to close
        if isinstance(fnam, str) and hasattr(fh, 'close'):
            fh.close()

    candidates = sorted(
        (binom_test(pats[k]['count'], nseen, 0.25**len(k),
                    alternative='greater'), k)
        for k in pats if pats[k]['count'])[:20]
    bestks = []
    best_pv = 1
    for pv, k in candidates:
        if pv <= best_pv or pv < 1e-100:
            if pv <= best_pv:
                best_pv = pv
            bestks.append(k)
    if not bestks:
        # same exception type as the former bestks[0] on an empty list
        raise IndexError('no restriction enzyme pattern found at the '
                         'beginning of the %d reads parsed' % nseen)
    # some times several patterns are equally probable (pv=0.0), in this case
    # we take the longest pattern that has a unique length.
    if len(bestks) > 1:
        lens = [len(k) for k in bestks]
        lens = dict((size, lens.count(size)) for size in set(lens))
        uniques = [k for k in bestks if lens[len(k)] == 1]
        # if every length is shared, fall back to the longest candidate
        bestk = max(uniques or bestks, key=len)
    else:
        bestk = bestks[0]
    return bestk, pats[bestk]['name'], best_pv


class RE_dict(dict):
    """
    Dictionary of restriction enzymes whose item access falls back to a
    case-insensitive search of the enzyme name.

    Only the ``[]`` access is affected (``in`` and ``get`` are inherited from
    dict unchanged).
    """
    def __getitem__(self, i):
        try:
            return super(RE_dict, self).__getitem__(i)
        except KeyError:
            # exact name not found: look for a name differing only by case
            for nam in self:
                if nam.lower() == i.lower():
                    return self[nam]
            raise KeyError('Restriction Enzyme %s not found\n' % (i))


# Target site of each enzyme, written with IUPAC codes, '|' marks the cut.
RESTRICTION_ENZYMES = RE_dict([
    ('AanI', 'TTA|TAA'), ('AarI', 'CACCTGC|'), ('AasI', 'GACNNNN|NNGTC'), ('AatII', 'GACGT|C'),
    ('AbaSI', 'C|'), ('AbsI', 'CC|TCGAGG'), ('Acc16I', 'TGC|GCA'), ('Acc36I', 'ACCTGC|'),
    ('Acc65I', 'G|GTACC'), ('AccB1I', 'G|GYRCC'), ('AccB7I', 'CCANNNN|NTGG'), ('AccBSI', 'CCG|CTC'),
    ('AccI', 'GT|MKAC'), ('AccII', 'CG|CG'), ('AccIII', 'T|CCGGA'), ('AceIII', 'CAGCTC|'),
    ('AciI', 'C|CGC'), ('AclI', 'AA|CGTT'), ('AclWI', 'GGATC|'), ('AcoI', 'Y|GGCCR'),
    ('AcsI', 'R|AATTY'), ('AcuI', 'CTGAAG|'), ('AcvI', 'CAC|GTG'), ('AcyI', 'GR|CGYC'),
    ('AdeI', 'CACNNN|GTG'), ('AfaI', 'GT|AC'), ('AfeI', 'AGC|GCT'), ('AfiI', 'CCNNNNN|NNGG'),
    ('AflII', 'C|TTAAG'), ('AflIII', 'A|CRYGT'), ('AgeI', 'A|CCGGT'), ('AgsI', 'TTS|AA'),
    ('AhaIII', 'TTT|AAA'), ('AhdI', 'GACNNN|NNGTC'), ('AhlI', 'A|CTAGT'), ('AjiI', 'CAC|GTC'),
    ('AjnI', '|CCWGG'), ('AjuI', 'GAANNNN|NNNTTGG'), ('AleI', 'CACNN|NNGTG'), ('AlfI', 'GC|ANNNNNNTGC'),
    ('AloI', 'GAACNN|NNNNTCC'), ('AluBI', 'AG|CT'), ('AluI', 'AG|CT'), ('Alw21I', 'GWGCW|C'),
    ('Alw26I', 'GTCTC|'), ('Alw44I', 'G|TGCAC'), ('AlwFI', 'GAAAYNNNNNRTG|GAAAYNNNNNRTG'), ('AlwI', 'GGATC|'),
    ('AlwNI', 'CAGNNN|CTG'), ('Ama87I', 'C|YCGRG'), ('Aor13HI', 'T|CCGGA'), ('Aor51HI', 'AGC|GCT'),
    ('AoxI', '|GGCC'), ('ApaBI', 'GCANNNNN|TGC'), ('ApaI', 'GGGCC|C'), ('ApaLI', 'G|TGCAC'),
    ('ApeKI', 'G|CWGC'), ('ApoI', 'R|AATTY'), ('ApyPI', 'ATCGAC|'), ('AquII', 'GCCGNAC|'),
    ('AquIII', 'GAGGAG|'), ('AquIV', 'GRGGAAG|'), ('ArsI', 'GACNN|NNNNTTYG'), ('AscI', 'GG|CGCGCC'),
    ('AseI', 'AT|TAAT'), ('Asi256I', 'G|ATC'), ('AsiGI', 'A|CCGGT'), ('AsiSI', 'GCGAT|CGC'),
    ('Asp700I', 'GAANN|NNTTC'), ('Asp718I', 'G|GTACC'), ('AspA2I', 'C|CTAGG'), ('AspBHI', 'YSCNS|'),
    ('AspLEI', 'GCG|C'), ('AspS9I', 'G|GNCC'), ('AssI', 'AGT|ACT'), ('AsuC2I', 'CC|SGG'),
    ('AsuHPI', 'GGTGA|'), ('AsuI', 'G|GNCC'), ('AsuII', 'TT|CGAA'), ('AsuNHI', 'G|CTAGC'),
    ('AvaI', 'C|YCGRG'), ('AvaII', 'G|GWCC'), ('AvaIII', 'ATGCAT|ATGCAT'), ('AvrII', 'C|CTAGG'),
    ('AxyI', 'CC|TNAGG'), ('BaeGI', 'GKGCM|C'), ('BaeI', 'A|CNNNNGTAYC'), ('BalI', 'TGG|CCA'),
    ('BamHI', 'G|GATCC'), ('BanI', 'G|GYRCC'), ('BanII', 'GRGCY|C'), ('BarI', 'GAAGNN|NNNNTAC'),
    ('BasI', 'CCANNNN|NTGG'), ('BauI', 'C|ACGAG'), ('Bbr7I', 'GAAGAC|'), ('BbrPI', 'CAC|GTG'),
    ('BbsI', 'GAAGAC|'), ('Bbv12I', 'GWGCW|C'), ('BbvCI', 'CC|TCAGC'), ('BbvI', 'GCAGC|'),
    ('BbvII', 'GAAGAC|'), ('BccI', 'CCATC|'), ('Bce83I', 'CTTGAG|'), ('BceAI', 'ACGGC|'),
    ('BcefI', 'ACGGC|'), ('BcgI', 'CG|ANNNNNNTGC'), ('BciT130I', 'CC|WGG'), ('BciVI', 'GTATCC|'),
    ('BclI', 'T|GATCA'), ('BcnI', 'CC|SGG'), ('BcoDI', 'GTCTC|'), ('BcuI', 'A|CTAGT'),
    ('BdaI', 'TG|ANNNNNNTCA'), ('BetI', 'W|CCGGW'), ('BfaI', 'C|TAG'), ('BfiI', 'ACTGGG|'),
    ('BfmI', 'C|TRYAG'), ('BfoI', 'RGCGC|Y'), ('BfrI', 'C|TTAAG'), ('BfuAI', 'ACCTGC|'),
    ('BfuCI', '|GATC'), ('BfuI', 'GTATCC|'), ('BglI', 'GCCNNNN|NGGC'), ('BglII', 'A|GATCT'),
    ('BinI', 'GGATC|'), ('BisI', 'GC|NGC'), ('BlnI', 'C|CTAGG'), ('BlpI', 'GC|TNAGC'),
    ('BlsI', 'GCN|GC'), ('BmcAI', 'AGT|ACT'), ('Bme1390I', 'CC|NGG'), ('Bme18I', 'G|GWCC'),
    ('BmeDI', 'C|'), ('BmeRI', 'GACNNN|NNGTC'), ('BmeT110I', 'C|YCGRG'), ('BmgBI', 'CAC|GTC'),
    ('BmgI', 'GKGCCC|GKGCCC'), ('BmgT120I', 'G|GNCC'), ('BmiI', 'GGN|NCC'), ('BmrFI', 'CC|NGG'),
    ('BmrI', 'ACTGGG|'), ('BmsI', 'GCATC|'), ('BmtI', 'GCTAG|C'), ('BmuI', 'ACTGGG|'),
    ('BoxI', 'GACNN|NNGTC'), ('BpiI', 'GAAGAC|'), ('BplI', 'GAG|NNNNNCTC'), ('BpmI', 'CTGGAG|'),
    ('Bpu10I', 'CC|TNAGC'), ('Bpu1102I', 'GC|TNAGC'), ('Bpu14I', 'TT|CGAA'), ('BpuEI', 'CTTGAG|'),
    ('BpuMI', 'CC|SGG'), ('BpvUI', 'CGAT|CG'), ('Bsa29I', 'AT|CGAT'), ('BsaAI', 'YAC|GTR'),
    ('BsaBI', 'GATNN|NNATC'), ('BsaHI', 'GR|CGYC'), ('BsaI', 'GGTCTC|'), ('BsaJI', 'C|CNNGG'),
    ('BsaWI', 'W|CCGGW'), ('BsaXI', 'AC|NNNNNCTCC'), ('BsbI', 'CAACAC|'), ('Bsc4I', 'CCNNNNN|NNGG'),
    ('BscAI', 'GCATC|'), ('BscGI', 'CCCGT|CCCGT'), ('Bse118I', 'R|CCGGY'), ('Bse1I', 'ACTGG|'),
    ('Bse21I', 'CC|TNAGG'), ('Bse3DI', 'GCAATG|'), ('Bse8I', 'GATNN|NNATC'), ('BseAI', 'T|CCGGA'),
    ('BseBI', 'CC|WGG'), ('BseCI', 'AT|CGAT'), ('BseDI', 'C|CNNGG'), ('BseGI', 'GGATG|'),
    ('BseJI', 'GATNN|NNATC'), ('BseLI', 'CCNNNNN|NNGG'), ('BseMI', 'GCAATG|'), ('BseMII', 'CTCAG|'),
    ('BseNI', 'ACTGG|'), ('BsePI', 'G|CGCGC'), ('BseRI', 'GAGGAG|'), ('BseSI', 'GKGCM|C'),
    ('BseX3I', 'C|GGCCG'), ('BseXI', 'GCAGC|'), ('BseYI', 'C|CCAGC'), ('BsgI', 'GTGCAG|'),
    ('Bsh1236I', 'CG|CG'), ('Bsh1285I', 'CGRY|CG'), ('BshFI', 'GG|CC'), ('BshNI', 'G|GYRCC'),
    ('BshTI', 'A|CCGGT'), ('BshVI', 'AT|CGAT'), ('BsiEI', 'CGRY|CG'), ('BsiHKAI', 'GWGCW|C'),
    ('BsiHKCI', 'C|YCGRG'), ('BsiI', 'C|ACGAG'), ('BsiSI', 'C|CGG'), ('BsiWI', 'C|GTACG'),
    ('BsiYI', 'CCNNNNN|NNGG'), ('BslFI', 'GGGAC|'), ('BslI', 'CCNNNNN|NNGG'), ('BsmAI', 'GTCTC|'),
    ('BsmBI', 'CGTCTC|'), ('BsmFI', 'GGGAC|'), ('BsmI', 'GAATGC|'), ('BsnI', 'GG|CC'),
    ('Bso31I', 'GGTCTC|'), ('BsoBI', 'C|YCGRG'), ('Bsp119I', 'TT|CGAA'), ('Bsp120I', 'G|GGCCC'),
    ('Bsp1286I', 'GDGCH|C'), ('Bsp13I', 'T|CCGGA'), ('Bsp1407I', 'T|GTACA'), ('Bsp143I', '|GATC'),
    ('Bsp1720I', 'GC|TNAGC'), ('Bsp19I', 'C|CATGG'), ('Bsp24I', 'GACN|NNNNNTGG'), ('Bsp68I', 'TCG|CGA'),
    ('BspACI', 'C|CGC'), ('BspCNI', 'CTCAG|'), ('BspD6I', 'GACTC|'), ('BspDI', 'AT|CGAT'),
    ('BspEI', 'T|CCGGA'), ('BspFNI', 'CG|CG'), ('BspGI', 'CTGGAC|CTGGAC'), ('BspHI', 'T|CATGA'),
    ('BspLI', 'GGN|NCC'), ('BspLU11I', 'A|CATGT'), ('BspMI', 'ACCTGC|'), ('BspMII', 'T|CCGGA'),
    ('BspNCI', 'CCAGA|CCAGA'), ('BspOI', 'GCTAG|C'), ('BspPI', 'GGATC|'), ('BspQI', 'GCTCTTC|'),
    ('BspT104I', 'TT|CGAA'), ('BspT107I', 'G|GYRCC'), ('BspTI', 'C|TTAAG'), ('BsrBI', 'CCG|CTC'),
    ('BsrDI', 'GCAATG|'), ('BsrFI', 'R|CCGGY'), ('BsrGI', 'T|GTACA'), ('BsrI', 'ACTGG|'),
    ('BsrSI', 'ACTGG|'), ('BssAI', 'R|CCGGY'), ('BssECI', 'C|CNNGG'), ('BssHII', 'G|CGCGC'),
    ('BssKI', '|CCNGG'), ('BssMI', '|GATC'), ('BssNAI', 'GTA|TAC'), ('BssNI', 'GR|CGYC'),
    ('BssSI', 'C|ACGAG'), ('BssT1I', 'C|CWWGG'), ('Bst1107I', 'GTA|TAC'), ('Bst2BI', 'C|ACGAG'),
    ('Bst2UI', 'CC|WGG'), ('Bst4CI', 'ACN|GT'), ('Bst6I', 'CTCTTC|'), ('BstACI', 'GR|CGYC'),
    ('BstAFI', 'C|TTAAG'), ('BstAPI', 'GCANNNN|NTGC'), ('BstAUI', 'T|GTACA'), ('BstBAI', 'YAC|GTR'),
    ('BstBI', 'TT|CGAA'), ('BstC8I', 'GCN|NGC'), ('BstDEI', 'C|TNAG'), ('BstDSI', 'C|CRYGG'),
    ('BstEII', 'G|GTNACC'), ('BstENI', 'CCTNN|NNNAGG'), ('BstF5I', 'GGATG|'), ('BstFNI', 'CG|CG'),
    ('BstH2I', 'RGCGC|Y'), ('BstHHI', 'GCG|C'), ('BstKTI', 'GAT|C'), ('BstMAI', 'GTCTC|'),
    ('BstMBI', '|GATC'), ('BstMCI', 'CGRY|CG'), ('BstMWI', 'GCNNNNN|NNGC'), ('BstNI', 'CC|WGG'),
    ('BstNSI', 'RCATG|Y'), ('BstOI', 'CC|WGG'), ('BstPAI', 'GACNN|NNGTC'), ('BstPI', 'G|GTNACC'),
    ('BstSCI', '|CCNGG'), ('BstSFI', 'C|TRYAG'), ('BstSLI', 'GKGCM|C'), ('BstSNI', 'TAC|GTA'),
    ('BstUI', 'CG|CG'), ('BstV1I', 'GCAGC|'), ('BstV2I', 'GAAGAC|'), ('BstX2I', 'R|GATCY'),
    ('BstXI', 'CCANNNNN|NTGG'), ('BstYI', 'R|GATCY'), ('BstZ17I', 'GTA|TAC'), ('BstZI', 'C|GGCCG'),
    ('Bsu15I', 'AT|CGAT'), ('Bsu36I', 'CC|TNAGG'), ('BsuI', 'GTATCC|'), ('BsuRI', 'GG|CC'),
    ('BtgI', 'C|CRYGG'), ('BtgZI', 'GCGATG|'), ('BthCI', 'GCNG|C'), ('BtrI', 'CAC|GTC'),
    ('BtsCI', 'GGATG|'), ('BtsI', 'GCAGTG|'), ('BtsIMutI', 'CAGTG|'), ('BtuMI', 'TCG|CGA'),
    ('BveI', 'ACCTGC|'), ('Cac8I', 'GCN|NGC'), ('CaiI', 'CAGNNN|CTG'), ('CauII', 'CC|SGG'),
    ('CchII', 'GGARGA|'), ('CchIII', 'CCCAAG|'), ('CciI', 'T|CATGA'), ('CciNI', 'GC|GGCCGC'),
    ('Cdi630V', 'CAAAAA|CAAAAA'), ('CdiI', 'CATC|G'), ('CdpI', 'GCGGAG|'), ('CfoI', 'GCG|C'),
    ('Cfr10I', 'R|CCGGY'), ('Cfr13I', 'G|GNCC'), ('Cfr42I', 'CCGC|GG'), ('Cfr9I', 'C|CCGGG'),
    ('CfrI', 'Y|GGCCR'), ('Cgl13032I', 'GGCGCA|GGCGCA'), ('Cgl13032II', 'ACGABGG|ACGABGG'), ('ChaI', 'GATC|'),
    ('CjeFIII', 'GCAAGG|GCAAGG'), ('CjeFV', 'GGRCA|GGRCA'), ('CjeI', 'CCA|NNNNNNGT'), ('CjeNII', 'GAGNNNNNGT|GAGNNNNNGT'),
    ('CjeNIII', 'GKAAYG|'), ('CjeP659IV', 'CACNNNNNNNGAA|CACNNNNNNNGAA'), ('CjePI', 'CCANN|NNNNNTC'), ('CjuI', 'CAYNNNNNRTG|CAYNNNNNRTG'),
    ('CjuII', 'CAYNNNNNCTC|CAYNNNNNCTC'), ('ClaI', 'AT|CGAT'), ('CpoI', 'CG|GWCCG'), ('CseI', 'GACGC|'),
    ('CsiI', 'A|CCWGGT'), ('Csp6I', 'G|TAC'), ('CspAI', 'A|CCGGT'), ('CspCI', 'C|AANNNNNGTGG'),
    ('CspI', 'CG|GWCCG'), ('CstMI', 'AAGGAG|'), ('CviAII', 'C|ATG'), ('CviJI', 'RG|CY'),
    ('CviKI_1', 'RG|CY'), ('CviQI', 'G|TAC'), ('CviRI', 'TG|CA'), ('DdeI', 'C|TNAG'),
    ('DinI', 'GGC|GCC'), ('DpnI', 'GA|TC'), ('DpnII', '|GATC'), ('DraI', 'TTT|AAA'),
    ('DraII', 'RG|GNCCY'), ('DraIII', 'CACNNN|GTG'), ('DraRI', 'CAAGNAC|'), ('DrdI', 'GACNNNN|NNGTC'),
    ('DrdII', 'GAACCA|GAACCA'), ('DriI', 'GACNNN|NNGTC'), ('DsaI', 'C|CRYGG'), ('DseDI', 'GACNNNN|NNGTC'),
    ('EaeI', 'Y|GGCCR'), ('EagI', 'C|GGCCG'), ('Eam1104I', 'CTCTTC|'), ('Eam1105I', 'GACNNN|NNGTC'),
    ('EarI', 'CTCTTC|'), ('EciI', 'GGCGGA|'), ('Ecl136II', 'GAG|CTC'), ('EclXI', 'C|GGCCG'),
    ('Eco105I', 'TAC|GTA'), ('Eco130I', 'C|CWWGG'), ('Eco147I', 'AGG|CCT'), ('Eco24I', 'GRGCY|C'),
    ('Eco31I', 'GGTCTC|'), ('Eco32I', 'GAT|ATC'), ('Eco47I', 'G|GWCC'), ('Eco47III', 'AGC|GCT'),
    ('Eco52I', 'C|GGCCG'), ('Eco53kI', 'GAG|CTC'), ('Eco57I', 'CTGAAG|'), ('Eco57MI', 'CTGRAG|'),
    ('Eco72I', 'CAC|GTG'), ('Eco81I', 'CC|TNAGG'), ('Eco88I', 'C|YCGRG'), ('Eco91I', 'G|GTNACC'),
    ('EcoHI', '|CCSGG'), ('EcoICRI', 'GAG|CTC'), ('EcoNI', 'CCTNN|NNNAGG'), ('EcoO109I', 'RG|GNCCY'),
    ('EcoO65I', 'G|GTNACC'), ('EcoRI', 'G|AATTC'), ('EcoRII', '|CCWGG'), ('EcoRV', 'GAT|ATC'),
    ('EcoT14I', 'C|CWWGG'), ('EcoT22I', 'ATGCA|T'), ('EcoT38I', 'GRGCY|C'), ('EgeI', 'GGC|GCC'),
    ('EheI', 'GGC|GCC'), ('ErhI', 'C|CWWGG'), ('EsaBC3I', 'TC|GA'), ('EsaSSI', 'GACCAC|GACCAC'),
    ('Esp3I', 'CGTCTC|'), ('EspI', 'GC|TNAGC'), ('FaeI', 'CATG|'), ('FaiI', 'YA|TR'),
    ('FalI', 'AAG|NNNNNCTT'), ('FaqI', 'GGGAC|'), ('FatI', '|CATG'), ('FauI', 'CCCGC|'),
    ('FauNDI', 'CA|TATG'), ('FbaI', 'T|GATCA'), ('FblI', 'GT|MKAC'), ('FinI', 'GGGAC|GGGAC'),
    ('FmuI', 'GGNC|C'), ('Fnu4HI', 'GC|NGC'), ('FnuDII', 'CG|CG'), ('FokI', 'GGATG|'),
    ('FriOI', 'GRGCY|C'), ('FseI', 'GGCCGG|CC'), ('Fsp4HI', 'GC|NGC'), ('FspAI', 'RTGC|GCAY'),
    ('FspBI', 'C|TAG'), ('FspEI', 'CC|'), ('FspI', 'TGC|GCA'), ('GauT27I', 'CGCGCAGG|CGCGCAGG'),
    ('GdiII', 'C|GGCCR'), ('GlaI', 'GC|GC'), ('GluI', 'GC|NGC'), ('GsaI', 'CCCAG|C'),
    ('GsuI', 'CTGGAG|'), ('HaeI', 'WGG|CCW'), ('HaeII', 'RGCGC|Y'), ('HaeIII', 'GG|CC'),
    ('HapII', 'C|CGG'), ('HauII', 'TGGCCA|'), ('HgaI', 'GACGC|'), ('HgiAI', 'GWGCW|C'),
    ('HgiCI', 'G|GYRCC'), ('HgiEII', 'ACCNNNNNNGGT|ACCNNNNNNGGT'), ('HgiJII', 'GRGCY|C'), ('HhaI', 'GCG|C'),
    ('Hin1I', 'GR|CGYC'), ('Hin1II', 'CATG|'), ('Hin4I', 'GAY|NNNNNVTC'), ('Hin4II', 'CCTTC|'),
    ('Hin6I', 'G|CGC'), ('HinP1I', 'G|CGC'), ('HincII', 'GTY|RAC'), ('HindII', 'GTY|RAC'),
    ('HindIII', 'A|AGCTT'), ('HinfI', 'G|ANTC'), ('HpaI', 'GTT|AAC'), ('HpaII', 'C|CGG'),
    ('HphI', 'GGTGA|'), ('Hpy166II', 'GTN|NAC'), ('Hpy178III', 'TC|NNGA'), ('Hpy188I', 'TCN|GA'),
    ('Hpy188III', 'TC|NNGA'), ('Hpy8I', 'GTN|NAC'), ('Hpy99I', 'CGWCG|'), ('Hpy99XIII', 'GCCTA|GCCTA'),
    ('Hpy99XIV', 'GGWTAA|GGWTAA'), ('HpyAV', 'CCTTC|'), ('HpyCH4III', 'ACN|GT'), ('HpyCH4IV', 'A|CGT'),
    ('HpyCH4V', 'TG|CA'), ('HpyF10VI', 'GCNNNNN|NNGC'), ('HpyF3I', 'C|TNAG'), ('HpySE526I', 'A|CGT'),
    ('Hsp92I', 'GR|CGYC'), ('Hsp92II', 'CATG|'), ('HspAI', 'G|CGC'), ('Jma19592I', 'GTATNAC|GTATNAC'),
    ('KasI', 'G|GCGCC'), ('KflI', 'GG|GWCCC'), ('Kpn2I', 'T|CCGGA'), ('KpnI', 'GGTAC|C'),
    ('KroI', 'G|CCGGC'), ('Ksp22I', 'T|GATCA'), ('Ksp632I', 'CTCTTC|'), ('KspAI', 'GTT|AAC'),
    ('KspI', 'CCGC|GG'), ('Kzo9I', '|GATC'), ('LguI', 'GCTCTTC|'), ('LpnI', 'RGC|GCY'),
    ('LpnPI', 'CCDG|'), ('Lsp1109I', 'GCAGC|'), ('LweI', 'GCATC|'), ('MabI', 'A|CCWGGT'),
    ('MaeI', 'C|TAG'), ('MaeII', 'A|CGT'), ('MaeIII', '|GTNAC'), ('MalI', 'GA|TC'),
    ('MaqI', 'CRTTGAC|'), ('MauBI', 'CG|CGCGCG'), ('MbiI', 'CCG|CTC'), ('MboI', '|GATC'),
    ('MboII', 'GAAGA|'), ('McaTI', 'GCGC|GC'), ('McrI', 'CGRY|CG'), ('MfeI', 'C|AATTG'),
    ('MflI', 'R|GATCY'), ('MhlI', 'GDGCH|C'), ('MjaIV', 'GTNNAC|GTNNAC'), ('MkaDII', 'GAGAYGT|GAGAYGT'),
    ('MlsI', 'TGG|CCA'), ('MluCI', '|AATT'), ('MluI', 'A|CGCGT'), ('MluNI', 'TGG|CCA'),
    ('Mly113I', 'GG|CGCC'), ('MlyI', 'GAGTC|'), ('MmeI', 'TCCRAC|'), ('MnlI', 'CCTC|'),
    ('Mph1103I', 'ATGCA|T'), ('MreI', 'CG|CCGGCG'), ('MroI', 'T|CCGGA'), ('MroNI', 'G|CCGGC'),
    ('MroXI', 'GAANN|NNTTC'), ('MscI', 'TGG|CCA'), ('MseI', 'T|TAA'), ('MslI', 'CAYNN|NNRTG'),
    ('Msp20I', 'TGG|CCA'), ('MspA1I', 'CMG|CKG'), ('MspCI', 'C|TTAAG'), ('MspI', 'C|CGG'),
    ('MspJI', 'CNNR|'), ('MspR9I', 'CC|NGG'), ('MssI', 'GTTT|AAAC'), ('MstI', 'TGC|GCA'),
    ('MunI', 'C|AATTG'), ('Mva1269I', 'GAATGC|'), ('MvaI', 'CC|WGG'), ('MvnI', 'CG|CG'),
    ('MvrI', 'CGAT|CG'), ('MwoI', 'GCNNNNN|NNGC'), ('NaeI', 'GCC|GGC'), ('NarI', 'GG|CGCC'),
    ('NciI', 'CC|SGG'), ('NcoI', 'C|CATGG'), ('NdeI', 'CA|TATG'), ('NdeII', '|GATC'),
    ('NgoAVIII', '|GACNNNNNTGA'), ('NgoMIV', 'G|CCGGC'), ('NhaXI', 'CAAGRAG|CAAGRAG'), ('NheI', 'G|CTAGC'),
    ('NlaCI', 'CATCAC|'), ('NlaIII', 'CATG|'), ('NlaIV', 'GGN|NCC'), ('Nli3877I', 'CYCGR|G'),
    ('NmeAIII', 'GCCGAG|'), ('NmeDI', '|RCCGGY'), ('NmuCI', '|GTSAC'), ('NotI', 'GC|GGCCGC'),
    ('NruI', 'TCG|CGA'), ('NsbI', 'TGC|GCA'), ('NsiI', 'ATGCA|T'), ('NspBII', 'CMG|CKG'),
    ('NspI', 'RCATG|Y'), ('NspV', 'TT|CGAA'), ('OliI', 'CACNN|NNGTG'), ('PabI', 'GTA|C'),
    ('PacI', 'TTAAT|TAA'), ('PaeI', 'GCATG|C'), ('PaeR7I', 'C|TCGAG'), ('PagI', 'T|CATGA'),
    ('PalAI', 'GG|CGCGCC'), ('PasI', 'CC|CWGGG'), ('PauI', 'G|CGCGC'), ('PceI', 'AGG|CCT'),
    ('PciI', 'A|CATGT'), ('PciSI', 'GCTCTTC|'), ('PcsI', 'WCGNNNN|NNNCGW'), ('PctI', 'GAATGC|'),
    ('PdiI', 'GCC|GGC'), ('PdmI', 'GAANN|NNTTC'), ('PenI', 'GCAGT|GCAGT'), ('PfeI', 'G|AWTC'),
    ('Pfl1108I', 'TCGTAG|TCGTAG'), ('Pfl23II', 'C|GTACG'), ('PflFI', 'GACN|NNGTC'), ('PflMI', 'CCANNNN|NTGG'),
    ('PfoI', 'T|CCNGGA'), ('PinAI', 'A|CCGGT'), ('PlaDI', 'CATCAG|'), ('Ple19I', 'CGAT|CG'),
    ('PleI', 'GAGTC|'), ('PluTI', 'GGCGC|C'), ('PmaCI', 'CAC|GTG'), ('PmeI', 'GTTT|AAAC'),
    ('PmlI', 'CAC|GTG'), ('PpiI', 'GAACN|NNNNCTC'), ('PpsI', 'GAGTC|'), ('Ppu10I', 'A|TGCAT'),
    ('Ppu21I', 'YAC|GTR'), ('PpuMI', 'RG|GWCCY'), ('PscI', 'A|CATGT'), ('PshAI', 'GACNN|NNGTC'),
    ('PshBI', 'AT|TAAT'), ('PsiI', 'TTA|TAA'), ('Psp03I', 'GGWC|C'), ('Psp124BI', 'GAGCT|C'),
    ('Psp1406I', 'AA|CGTT'), ('Psp5II', 'RG|GWCCY'), ('Psp6I', '|CCWGG'), ('PspCI', 'CAC|GTG'),
    ('PspEI', 'G|GTNACC'), ('PspGI', '|CCWGG'), ('PspLI', 'C|GTACG'), ('PspN4I', 'GGN|NCC'),
    ('PspOMI', 'G|GGCCC'), ('PspOMII', 'CGCCCAR|'), ('PspPI', 'G|GNCC'), ('PspPPI', 'RG|GWCCY'),
    ('PspPRI', 'CCYCAG|'), ('PspXI', 'VC|TCGAGB'), ('PsrI', 'GAACNN|NNNNTAC'), ('PssI', 'RGGNC|CY'),
    ('PstI', 'CTGCA|G'), ('PstNI', 'CAGNNN|CTG'), ('PsuI', 'R|GATCY'), ('PsyI', 'GACN|NNGTC'),
    ('PteI', 'G|CGCGC'), ('PvuI', 'CGAT|CG'), ('PvuII', 'CAG|CTG'), ('R2_BceSIV', '|GCAGC'),
    ('RceI', 'CATCGAC|'), ('RdeGBI', 'CCGCAG|CCGCAG'), ('RdeGBII', 'ACCCAG|'), ('RdeGBIII', '|TGRYCA'),
    ('RflFIII', 'CGCCAG|CGCCAG'), ('RgaI', 'GCGAT|CGC'), ('RigI', 'GGCCGG|CC'), ('RlaI', 'VCW|VCW'),
    ('RleAI', 'CCCACA|'), ('RpaB5I', 'CGRGGAC|'), ('RpaBI', 'CCCGCAG|'), ('RpaI', 'GTYGGAG|'),
    ('RpaTI', 'GRTGGAG|GRTGGAG'), ('RruI', 'TCG|CGA'), ('RsaI', 'GT|AC'), ('RsaNI', 'G|TAC'),
    ('RseI', 'CAYNN|NNRTG'), ('Rsr2I', 'CG|GWCCG'), ('RsrII', 'CG|GWCCG'), ('SacI', 'GAGCT|C'),
    ('SacII', 'CCGC|GG'), ('SalI', 'G|TCGAC'), ('SanDI', 'GG|GWCCC'), ('SapI', 'GCTCTTC|'),
    ('SaqAI', 'T|TAA'), ('SatI', 'GC|NGC'), ('Sau3AI', '|GATC'), ('Sau96I', 'G|GNCC'),
    ('SauI', 'CC|TNAGG'), ('SbfI', 'CCTGCA|GG'), ('ScaI', 'AGT|ACT'), ('SchI', 'GAGTC|'),
    ('SciI', 'CTC|GAG'), ('ScrFI', 'CC|NGG'), ('SdaI', 'CCTGCA|GG'), ('SdeAI', 'CAGRAG|'),
    ('SdeOSI', '|GACNNNNRTGA'), ('SduI', 'GDGCH|C'), ('SecI', 'C|CNNGG'), ('SelI', '|CGCG'),
    ('SetI', 'ASST|'), ('SexAI', 'A|CCWGGT'), ('SfaAI', 'GCGAT|CGC'), ('SfaNI', 'GCATC|'),
    ('SfcI', 'C|TRYAG'), ('SfeI', 'C|TRYAG'), ('SfiI', 'GGCCNNNN|NGGCC'), ('SfoI', 'GGC|GCC'),
    ('Sfr274I', 'C|TCGAG'), ('Sfr303I', 'CCGC|GG'), ('SfuI', 'TT|CGAA'), ('SgeI', 'CNNG|'),
    ('SgfI', 'GCGAT|CGC'), ('SgrAI', 'CR|CCGGYG'), ('SgrBI', 'CCGC|GG'), ('SgrDI', 'CG|TCGACG'),
    ('SgrTI', 'CCDS|'), ('SgsI', 'GG|CGCGCC'), ('SimI', 'GG|GTC'), ('SlaI', 'C|TCGAG'),
    ('SmaI', 'CCC|GGG'), ('SmiI', 'ATTT|AAAT'), ('SmiMI', 'CAYNN|NNRTG'), ('SmlI', 'C|TYRAG'),
    ('SmoI', 'C|TYRAG'), ('SnaBI', 'TAC|GTA'), ('SnaI', 'GTATAC|GTATAC'), ('Sno506I', 'GGCCGAG|GGCCGAG'),
    ('SpeI', 'A|CTAGT'), ('SphI', 'GCATG|C'), ('SplI', 'C|GTACG'), ('SpoDI', 'GCGGRAG|GCGGRAG'),
    ('SrfI', 'GCCC|GGGC'), ('Sse232I', 'CG|CCGGCG'), ('Sse8387I', 'CCTGCA|GG'), ('Sse8647I', 'AG|GWCCT'),
    ('Sse9I', '|AATT'), ('SseBI', 'AGG|CCT'), ('SsiI', 'C|CGC'), ('SspD5I', 'GGTGA|'),
    ('SspDI', 'G|GCGCC'), ('SspI', 'AAT|ATT'), ('SstE37I', 'CGAAGAC|'), ('SstI', 'GAGCT|C'),
    ('Sth132I', 'CCCG|'), ('Sth302II', 'CC|GG'), ('StrI', 'C|TCGAG'), ('StsI', 'GGATG|'),
    ('StuI', 'AGG|CCT'), ('StyD4I', '|CCNGG'), ('StyI', 'C|CWWGG'), ('SwaI', 'ATTT|AAAT'),
    ('TaaI', 'ACN|GT'), ('TaiI', 'ACGT|'), ('TaqI', 'T|CGA'), ('TaqII', 'GACCGA|'),
    ('TasI', '|AATT'), ('TatI', 'W|GTACW'), ('TauI', 'GCSG|C'), ('TfiI', 'G|AWTC'),
    ('Tru1I', 'T|TAA'), ('Tru9I', 'T|TAA'), ('TscAI', 'CASTG|'), ('TseFI', '|GTSAC'),
    ('TseI', 'G|CWGC'), ('TsoI', 'TARCCA|'), ('Tsp45I', '|GTSAC'), ('Tsp4CI', 'ACN|GT'),
    ('TspDTI', 'ATGAA|'), ('TspEI', '|AATT'), ('TspGWI', 'ACGGA|'), ('TspMI', 'C|CCGGG'),
    ('TspRI', 'CASTG|'), ('TssI', 'GAGNNNCTC|GAGNNNCTC'), ('TstI', 'CACN|NNNNNTCC'), ('TsuI', 'GCGAC|GCGAC'),
    ('Tth111I', 'GACN|NNGTC'), ('Tth111II', 'CAARCA|'), ('UbaF11I', 'TCGTA|TCGTA'), ('UbaF12I', 'CTACNNNGTC|CTACNNNGTC'),
    ('UbaF13I', 'GAGNNNNNNCTGG|GAGNNNNNNCTGG'), ('UbaF14I', 'CCANNNNNTCG|CCANNNNNTCG'), ('UbaF9I', 'TACNNNNNRTGT|TACNNNNNRTGT'), ('UbaPI', 'CGAACG|CGAACG'),
    ('UcoMSI', '|GAGCTC'), ('UnbI', '|GGNCC'), ('Van91I', 'CCANNNN|NTGG'), ('Vha464I', 'C|TTAAG'),
    ('VneI', 'G|TGCAC'), ('VpaK11AI', '|GGWCC'), ('VpaK11BI', 'G|GWCC'), ('VspI', 'AT|TAAT'),
    ('WviI', 'CACRAG|'), ('XagI', 'CCTNN|NNNAGG'), ('XapI', 'R|AATTY'), ('XbaI', 'T|CTAGA'),
    ('XceI', 'RCATG|Y'), ('XcmI', 'CCANNNNN|NNNNTGG'), ('XhoI', 'C|TCGAG'), ('XhoII', 'R|GATCY'),
    ('XmaI', 'C|CCGGG'), ('XmaIII', 'C|GGCCG'), ('XmaJI', 'C|CTAGG'), ('XmiI', 'GT|MKAC'),
    ('XmnI', 'GAANN|NNTTC'), ('XspI', 'C|TAG'), ('YkrI', 'C|'), ('ZraI', 'GAC|GTC'),
    ('ZrmI', 'AGT|ACT'), ('Zsp2I', 'ATGCA|T'),
])