Python Assignment 4

Classes

Exercise 1: import Sequence

#!/usr/bin/env python
from sequence import Sequence
f = open('./ambiguous_sequences.seq','r')
for l in f.readlines():
    s = Sequence(l.rstrip())
    print 'Sequence has', s.__len__(), 'bases.'
    counts = s.letter_counts()
    for base, count in counts.items():
        print base, '=', count


Result:

Sequence has 1267 bases.
A = 287
C = 306
B = 1
G = 389
R = 1
T = 282
Y = 1
Sequence has 553 bases.
A = 119
C = 161
T = 131
G = 141
N = 1
Sequence has 1521 bases.
A = 402
C = 196
T = 471
G = 215
N = 237
...
...
Sequence has 1285 bases.
A = 327
Y = 1
C = 224
T = 371
G = 362
Sequence has 570 bases.
A = 158
C = 120
T = 163
G = 123
N = 6
Sequence has 1801 bases.
C = 376
A = 465
S = 1
T = 462
G = 497

Exercise 2: Sequence.to_protein()

#!/usr/bin/env python
from sequence import Sequence
DNA = raw_input('Enter DNA sequence: ').upper()
seq = Sequence(DNA)
for frame in (1, 2, 3):
    protein = seq.to_protein(rf=frame)
    print 'Frame %d: %s' % (frame, protein)


Result:

$  python e2.py 
Enter DNA sequence: ATGCCCAAGCTGAATAGCGTAGAGGGGTTTTAA
Frame 1: <START>PK<START>NSVEGF<STOP>
Frame 2: CPS<STOP>IA<STOP>RGF
Frame 3: AQAE<STOP>RRGVL

Standard Library

Exercise 1: find modules

Which module or modules do you think would be useful if you were faced with the following problems?

Answer: urllib (for example, to fetch the contents of a web page).

import urllib
# Open the URL and read the whole response body in one expression.
urllib.urlopen('http://augix.com').read()

Answer: timeit (for example, to measure how long a small piece of code takes to run).

import timeit
# The statement being timed, and the setup code that runs once before it.
statement = 'for i in xrange(10): oct(i)'
setup = 'gc.enable()'
timer = timeit.Timer(statement, setup)
timer.timeit()

Exercise 2: sys.argv

#!/usr/bin/env python
import sys
print 'Argument 1 was:', sys.argv[1]
print 'Argument 2 was:', sys.argv[2]


Result:

$ python e2.py sequences.seq motifs.txt
Argument 1 was: sequences.seq
Argument 2 was: motifs.txt

Exercise 3

#!/usr/bin/env python
import sys
f = open(sys.argv[1],'r')
lines = f.readlines()
for l in lines:
    seq = l.rstrip()
    counts = {}
    for s in seq:
        counts[s] = counts.get(s, 0) + 1
    print 'Sequence has', str(len(seq)), 'bases.'
    for key, value in counts.items():
        print key, '=', value


Result:

$ ./e3.py ../../Assignment2/1.Dict/ambiguous_sequences.seq 
Sequence has 1267 bases.
A = 287
C = 306
B = 1
...

Using modules

Exercise 1: csv module

#!/usr/bin/env python
f = open('./numbers1.csv','r')
import csv
lines = csv.reader(f)
for l in lines:
    print sum(map(int,l))


Result:

$ ./e1.py 
716
1139
11
1707
1516

Exercise 2: csv.Sniffer()

#!/usr/bin/env python
csvfile = open('./numbers2.csv','r')
import csv
dialect = csv.Sniffer().sniff(csvfile.read(20))
csvfile.seek(0)
lines = csv.reader(csvfile,dialect)
for l in lines:
    print sum(map(int,l))


Result:

$ ./e2.py 
716
1139
11
1707
1516

Regular expressions

Exercise 1

#!/usr/bin/env python

import re

match = ['pit', 'spot', 'spate', 'slap two', 'respite']
no_match = ['', 'Pit', 'peat', 'part', 'sport']

pattern = r"p.t"   # This is your regex pattern!

regex = re.compile(pattern)

for S in match:
  if not regex.search(S):
    print "Your regex doesn't match", S

for N in no_match:
  if regex.search(N):
    print "Your regex matches", N

Exercise 2: Sentence endings

#!/usr/bin/env python

import re

match = ["assumes word senses. Within",
         "does the clustering. In the",
         "but when? It was hard to tell",
         'he arrive." After she had',
         "mess! He did not let it",
         "it wasn't hers!' She replied",
         "always thought so.) Then"]

no_match = ["in the U.S.A., people often",
            'John?", he often thought, but',
            "weighed 17.5 grams",
            "well ... they'd better not",
            "A.I. has long been a very",
            'like that", he thought',
            "but W. G. Grace never had much"]

pattern = r"\w\w[\.?!]{1}[\'\"\)]?\s"        # This is your regex pattern!

regex = re.compile(pattern)

for S in match:
  if not regex.search(S):
    print "Your regex doesn't match", S

for N in no_match:
  if regex.search(N):
    print "Your regex matches", N

Exercise 3: ORF

#!/usr/bin/env python
def ORF(seq):
    """Find and report the longest open reading frame (ORF) in seq.

    An ORF is an ATG start codon, followed by one or more A/C/G/T codons
    that are not stop codons, followed by the first in-frame stop codon
    (TAA, TAG or TGA). Every ATG in the sequence is tried as a start, so
    overlapping candidates in all three frames are considered; on a tie the
    leftmost candidate wins.

    seq: DNA sequence as an upper-case string.

    Prints a one-sentence report and returns (frame, length), where frame
    is 1, 2 or 3 (start offset modulo 3, plus 1) and length is the number
    of nucleotides including the start and stop codons. Returns None when
    no ORF is found.
    """
    import re
    # (?!TAA|TAG|TGA) ends the codon run at the FIRST in-frame stop codon;
    # a plain greedy codon run would read through stop codons to the last
    # one. Wrapping the pattern in a lookahead makes finditer try every
    # start position, so overlapping candidates are not skipped.
    pattern = r'(?=(ATG(?:(?!TAA|TAG|TGA)[ACGT]{3})+(?:TAA|TAG|TGA)))'
    regex = re.compile(pattern)
    best = None
    for res in regex.finditer(seq):
        length = len(res.group(1))
        # Strict '>' keeps the leftmost candidate when lengths are equal.
        if best is None or length > best[1]:
            best = (res.start() % 3 + 1, length)
    # print(...) with a single string behaves the same on Python 2 and 3.
    if best is None:
        print('In sequence:\n%s \nNo ORF found!' % seq)
        return None
    print('In sequence:\n%s \nFrame %s contains the longest open reading '
          'frame with %s nucleotides.' % (seq, best[0], best[1]))
    return best

# Run ORF on the two example sequences in turn.
for seq in ('ATGCCCAAGCTGAATAGCGTAGAGGGGTTTTAA',
            'ACGATGCTGCACATGCAACTTGACGACGAC'):
    ORF(seq)


Result:

$ python e3.py 
In sequence:
ATGCCCAAGCTGAATAGCGTAGAGGGGTTTTAA 
Frame 1 contains the longest open reading frame with 33 nucleotides.
In sequence:
ACGATGCTGCACATGCAACTTGACGACGAC 
No ORF found!
Homepage
Comments

Hide Comments