Dictionaries
Exercise 1: count bases
#!/usr/bin/env python seq = raw_input('Enter DNA: ') counts = {} for s in seq: counts[s] = counts.get(s, 0) + 1 for key, value in counts.items(): print key, '=', value
Result:
Enter DNA: ACRSAS A = 2 C = 1 R = 1 S = 2
Exercise 2: count for several sequences
#!/usr/bin/env python f = open('./ambiguous_sequences.seq') lines = f.readlines() for l in lines: seq = l.rstrip() counts = {} for s in seq: counts[s] = counts.get(s, 0) + 1 print 'Sequence has', str(len(seq)), 'bases.' for key, value in counts.items(): print key, '=', value
Result:
Sequence has 1267 bases. A = 287 C = 306 B = 1 G = 389 R = 1 T = 282 Y = 1 Sequence has 553 bases. A = 119 C = 161 T = 131 G = 141 N = 1 Sequence has 1521 bases. A = 402 C = 196 T = 471 G = 215 N = 237 ... ... Sequence has 1285 bases. A = 327 Y = 1 C = 224 T = 371 G = 362 Sequence has 570 bases. A = 158 C = 120 T = 163 G = 123 N = 6 Sequence has 1801 bases. C = 376 A = 465 S = 1 T = 462 G = 497
Exercise 3: sort keys
#!/usr/bin/env python f = open('./ambiguous_sequences.seq') lines = f.readlines() for l in lines: seq = l.rstrip() counts = {} for s in seq: counts[s] = counts.get(s, 0) + 1 print 'Sequence has', str(len(seq)), 'bases.' for key in sorted(counts.keys()): print key, '=', counts[key]
Result:
Sequence has 1267 bases. A = 287 B = 1 C = 306 G = 389 R = 1 T = 282 Y = 1 ...
Exercise 4: sum up
#!/usr/bin/env python import sys counts = {} for l in sys.stdin: seq = l.rstrip() for s in seq: counts[s] = counts.get(s, 0) + 1 print 'File has', str(sum(counts.values())), 'bases.' for key in sorted(counts.keys()): print key, '=', counts[key]
Result:
File has 24789 bases. A = 6504 B = 1 C = 5129 D = 1 G = 5868 K = 1 M = 1 N = 392 R = 3 S = 2 T = 6878 W = 1 Y = 8
Exercise 5: reverse complementary
#!/usr/bin/env python import sys counts = {} comp = dict(zip('ATGC','TACG')) def revcomp(seq): seq = list(seq) return ''.join([comp.get(nt,'N') for nt in seq[::-1]]) for line in sys.stdin: seq0 = line.rstrip() seq1 = revcomp(seq0) print seq0, '\n->\n', seq1, '\n'
Result:
$ python 5.py < 10_sequences.seq CCTGTATTAGCAGCAGATTCGATTAGCTTTACAACAATTCAATAAAATAGCTTCGCGCTAA -> TTAGCGCGAAGCTATTTTATTGAATTGTTGTAAAGCTAATCGAATCTGCTGCTAATACAGG ATTTTTAACTTTTCTCTGTCGTCGCACAATCGACTTTCTCTGTTTTCTTGGGTTTACCGGAA -> TTCCGGTAAACCCAAGAAAACAGAGAAAGTCGATTGTGCGACGACAGAGAAAAGTTAAAAAT ...
Exercise 6: ambiguous DNA complement
#!/usr/bin/env python import sys counts = {} comp = dict(zip('ATGC','TACG')) ambiguous_dna_complement = dict(zip('ACGTMRWSYKVHDBN','TGCAKYWSRMBDHVN')) def revcomp(seq): seq = list(seq) return ''.join([ambiguous_dna_complement[nt] for nt in seq[::-1]]) for line in sys.stdin: seq0 = line.rstrip() seq1 = revcomp(seq0) print seq0, '\n->\n', seq1, '\n'
Result:
$ python 6.py AMCRGTH AMCRGTH -> DACYGKT
Advanced exercises
Exercise 1
#!/usr/bin/env python input = open('./iupac_codes.txt') input.readline() # remove header d = dict() for line in input.readlines(): am, uns = line.rstrip().split() uns = uns.split(',') for un in uns: # assign ambiguous code to unambiguous code. if d.has_key(un): d[un] += ',' + am else: d[un] = am output = open('iupac_out.txt','w') print >>output, 'Nucl.\tCodes' for key, value in d.items(): print >>output, '\t'.join([key, value])
Result:
$ cat iupac_out.txt Nucl. Codes A A,M,R,W,V,H,D,N C C,M,S,Y,V,H,B,N T T,W,Y,K,H,D,B,N G G,R,S,K,V,D,B,N
Exercise 2: translate DNA to protein
#!/usr/bin/env python codon_table = { 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G' } stop_codons = ['TAA', 'TAG', 'TGA'] start_codons = ['TTG', 'CTG', 'ATG'] more = dict(zip(stop_codons, ' *')) codon_table.update(more) seq = raw_input('Enter DNA sequence: ').upper() codons = [seq[i*3:i*3+3] for i in range(len(seq)/3)] # alternatively, this keeps the extra bases at the end if the sequence length is not a multiple of 3. # codons = [seq[i:i+3] for i in range(0, len(seq), 3)] assert (codons[0] in start_codons), "First codon is not a start codon." acids = [codon_table.get(codon, '*') for codon in codons] print ''.join(acids)
Result:
Enter DNA sequence: atgcagtcagctagctagctagctagctagatcgtacgatcgacatagctcg MQSAS*LAS*IVRST*L
Hide Comments