defrevcomp(seq): return complement(seq)[::-1] # Input n site, all CDs sequences and trap37bp sequences, and check whether the site can stop # 1 for success, 0 for failure
defstop_test_for_Z(n, cds_lst, trap): new_cds_lst = [cds for cds in cds_lst if trap in cds] score_lst = [] iflen(new_cds_lst) > 0: for haha in new_cds_lst: m = haha.find(trap) + 1 + 10 score = (m - 1 + n - 1) score_lst.append(score) iflen([new_score for new_score in score_lst if score % 3 == 0]) > 0: return1 else: return0 else: return0
defstop_test_for_F(n, cds_lst, trap): trap = revcomp(trap) new_cds_lst = [cds for cds in cds_lst if trap in cds] score_lst = [] iflen(new_cds_lst) > 0: for haha in new_cds_lst: m = haha.find(trap) + 1 + 10 + 23 score = (m - n) score_lst.append(score) iflen([new_score for new_score in score_lst if score % 3 == 0]) > 0: return1 else: return0 else: return0 # Read all CDS files and return a dictionary. The key is gene and the value is CDS sequence
defread_cds(fs): fasta = {} withopen(fs, 'r') as f: for line in f: if line.startswith(">"): seq_lst = [] name = line.strip("\n").replace(">", "") else: seq_lst.append(line.strip("\n")) fasta[name] = "".join(seq_lst) new_fasta = {key: value for key, value in fasta.items() if"unavailable"notin value} return new_fasta # Read all the trap sequences and return a dictionary. The key is trap label and the value is 37bp
defread_seq(fs): fasta = {} withopen(fs, 'r') as f: for line in f: if line.startswith(">"): seq_lst = [] name = line.strip("\n").replace(">", "") else: seq_lst.append(line.strip("\n")) fasta[name] = seq_lst return fasta # Read all reference sequences. The key is label and the value is 37bptrap sequence
defread_ref(fs): ref = {} withopen(fs, 'r') as f: for line in f: if line.startswith(">"): name = line.strip("\n").replace(">", "") else: ref[name] = line.strip("\n")[118:155] return ref # Read the corresponding relationship between gene and label and return to the dictionary
defread_duiying(fs): duiying = {} withopen(fs, 'r') as f: for line in f: lable, gene = line.split("\t") duiying[lable.strip("\n")] = gene.strip("\n") return duiying
defmotif_Z(trap, site): if trap[10:30][site: site + 3] in ["CAG", "CGA", "CAA"]: return1 else: return0
defmotif_F(trap, site): if"CCA"in trap[10:30][site-2: site + 3]: return1 else: return0
lable_map_gene = read_duiying("../duiying.txt") all_cds = read_cds("../mart_export.txt") all_ref = read_ref("ref.fa") all_trapseq = read_seq("new508.filter.fa") eff_lst = [] for name, traps in all_ref.items(): gene = lable_map_gene[name] cds_lst = [vals for keys, vals in all_cds.items() if gene in keys] pass_lst = [] if name inlist(all_trapseq.keys()): alltrap = len(all_trapseq[name]) seq_lst = all_trapseq[name] for m in re.finditer("C", traps[10:30]): if motif_Z(traps, m.start()): if stop_test_for_Z((m.start() + 1), cds_lst, traps): while seq_lst: gg=seq_lst.pop(0) if gg[10:30][m.start()] == "T": pass_lst.append(gg) iflen(pass_lst) > 0: withopen("pass_lst_z.fa", 'a') as wocao: wocao.write(">{}\n".format(name)) for nima in pass_lst: wocao.write("{}\n".format(nima)) if alltrap > 0: eff_lst.append([name, len(pass_lst) / alltrap]) else: print(name)
for name, traps in all_ref.items(): gene = lable_map_gene[name] cds_lst = [vals for keys, vals in all_cds.items() if gene in keys] pass_lst = [] if name inlist(all_trapseq.keys()): alltrap = len(all_trapseq[name]) seq_lst2 = all_trapseq[name] for m in re.finditer("C", traps[10:30]): if motif_F(traps, m.start()): if stop_test_for_F(((traps[10:30][m.start()-2: m.start() + 3]).find("CCA")+1+2), cds_lst, traps): while seq_lst2: gg=seq_lst2.pop(0) if gg[10:30][m.start()] == "T": pass_lst.append(gg) iflen(pass_lst) > 0: withopen("pass_lst_f.fa", 'a') as wocao: wocao.write(">{}\n".format(name)) for nima in pass_lst: wocao.write("{}\n".format(nima)) if alltrap > 0: eff_lst.append([name, len(pass_lst) / alltrap]) else: print(name)
withopen("trap_stop_eff.txt", 'a') as l: l.write("trap-lable\ttrap-effective\n") for lable, effctive in eff_lst: l.write("{}\t{}\n".format(lable, effctive))
defstop_test_for_Z(n, cds_lst, trap): new_cds_lst = [cds for cds in cds_lst if trap in cds] score_lst = [] iflen(new_cds_lst) > 0: for haha in new_cds_lst: m = haha.find(trap) + 1 + 10 score = (m - 1 + n - 1) score_lst.append(score) iflen([new_score for new_score in score_lst if score % 3 == 0]) > 0: return1 else: return0 else: return0
defstop_test_for_F(n, cds_lst, trap): trap = revcomp(trap) new_cds_lst = [cds for cds in cds_lst if trap in cds] score_lst = [] iflen(new_cds_lst) > 0: for haha in new_cds_lst: m = haha.find(trap) + 1 + 10 + 23 score = (m - n) score_lst.append(score) iflen([new_score for new_score in score_lst if score % 3 == 0]) > 0: return1 else: return0 else: return0 # 读取所有的cds文件返回一个字典,键为gene,值为cds序列
defread_cds(fs): fasta = {} withopen(fs, 'r') as f: for line in f: if line.startswith(">"): seq_lst = [] name = line.strip("\n").replace(">", "") else: seq_lst.append(line.strip("\n")) fasta[name] = "".join(seq_lst) new_fasta = {key: value for key, value in fasta.items() if"unavailable"notin value} return new_fasta # 读取所有的trap序列返回一个字典,键为trap-lable,值为37bp列表
defread_seq(fs): fasta = {} withopen(fs, 'r') as f: for line in f: if line.startswith(">"): seq_lst = [] name = line.strip("\n").replace(">", "") else: seq_lst.append(line.strip("\n")) fasta[name] = seq_lst return fasta # 读取所有的参考序列,键为lable,值为37bptrap序列
defread_ref(fs): ref = {} withopen(fs, 'r') as f: for line in f: if line.startswith(">"): name = line.strip("\n").replace(">", "") else: ref[name] = line.strip("\n")[118:155] return ref # 读取基因和lable的对应关系,返回字典
defread_duiying(fs): duiying = {} withopen(fs, 'r') as f: for line in f: lable, gene = line.split("\t") duiying[lable.strip("\n")] = gene.strip("\n") return duiying
defmotif_Z(trap, site): if trap[10:30][site: site + 3] in ["CAG", "CGA", "CAA"]: return1 else: return0
defmotif_F(trap, site): if"CCA"in trap[10:30][site-2: site + 3]: return1 else: return0
lable_map_gene = read_duiying("../duiying.txt") all_cds = read_cds("../mart_export.txt") all_ref = read_ref("ref.fa") all_trapseq = read_seq("new508.filter.fa") withopen("pass_lst_f.txt", 'a') as wocao: wocao.write( "TRAP ID\tGene\tgRNA sequence\tstrand\tC>T position\tC>T efficiency\n") withopen("pass_lst_z.txt", 'a') as wocao: wocao.write( "TRAP ID\tGene\tgRNA sequence\tstrand\tC>T position\tC>T efficiency\n") for name, traps in all_ref.items(): gene = lable_map_gene[name] cds_lst = [vals for keys, vals in all_cds.items() if gene in keys] pass_lst = [] for m in re.finditer("C", traps[10:30]): if motif_Z(traps, m.start()): if stop_test_for_Z((m.start() + 1), cds_lst, traps) and name inlist(all_trapseq.keys()): i = 0 for gg in all_trapseq[name]: if gg[10:30][m.start()] == "T": i = i + 1 eff = i/len(all_trapseq[name]) pass_lst.append( [name, traps[10:33], "+", (m.start() + 1), eff]) iflen(pass_lst) > 0: withopen("pass_lst_z.txt", 'a') as wocao: for nima in pass_lst: [trap_id, trap_seq, strand, ctposition, efftion] = nima wocao.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( trap_id, gene, trap_seq, strand, ctposition, efftion))
for name, traps in all_ref.items(): gene = lable_map_gene[name] cds_lst = [vals for keys, vals in all_cds.items() if gene in keys] pass_lst = [] for m in re.finditer("C", traps[10:30]): if motif_F(traps, m.start()): if stop_test_for_F(((traps[10:30][m.start()-2: m.start() + 3]).find("CCA")+1+2), cds_lst, traps) and name inlist(all_trapseq.keys()): j = 0 for gg in all_trapseq[name]: if gg[10:30][m.start()] == "T": j = j + 1 eff = j/len(all_trapseq[name]) pass_lst.append( [name, traps[10:33], "-", (m.start() + 1), eff]) iflen(pass_lst) > 0: withopen("pass_lst_f.txt", 'a') as wocao: for nima in pass_lst: [trap_id, trap_seq, strand, ctposition, efftion] = nima wocao.write("{}\t{}\t{}\t{}\t{}\t{}\n".format( trap_id, gene, trap_seq, strand, ctposition, efftion))