From 2b289bd92c3f9d3ea8cbd5054f55fb1f2651b968 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 2 Aug 2011 19:35:11 -0400 Subject: [PATCH] Add I27-synthesis post. --- posts/I27-synthesis.mdwn | 469 ++++++++++++++++++++++++++++++++ posts/I27-synthesis/mRNAcode.py | 309 +++++++++++++++++++++ 2 files changed, 778 insertions(+) create mode 100644 posts/I27-synthesis.mdwn create mode 100755 posts/I27-synthesis/mRNAcode.py diff --git a/posts/I27-synthesis.mdwn b/posts/I27-synthesis.mdwn new file mode 100644 index 0000000..f56ba95 --- /dev/null +++ b/posts/I27-synthesis.mdwn @@ -0,0 +1,469 @@ +Over the last few days I've been trying to teach myself enough +genetics to reconstruct [Carrion-Vazquez's poly-I27 synthesis +procedure][cv99]. I'm not quite there yet, but I feel like I've made +enough progress that it's worth posting my notes somewhere public in +case they are useful to others. + +Overview +======== + +We buy our poly-I27 from [AthenaES][], who market it as [I27O™][I27O]. +Perusing their [technical brief][I27O-tb] makes it clear that I27O™ +corresponds to Carrion-Vazquez's I27RS₈. In +[Carrion-Vazquez' original paper][cv99] they describe the synthesis of +both I27RS₈ and a variant I27GLG₁₂. Their +I27RS₈ procedure is: + +* Human cardiac muscle used to generate a cDNA library [Rief 1997] +* cDNA library amplified with PCR + - 5' primer contained a BamHI restriction site that permitted + in-frame cloning of the monomer into the expression vector pQE30 + (Qiagen, Chatsworth, CA). + - The 3' primer contained a BglII restriction site, two Cys codons + located 3' to the BglII site and in-frame with the I27 domain, + and two in-frame stop codons. +* The PCR product was cloned into pUC19 linearized with BamHI and SmaI. +* The 8-domain synthetic gene was constructed by iterative cloning + of monomer into monomer, dimer into dimer, and tetramer into + tetramer. 
+* The final construct contained eight direct repeats of the I27 + domain, an amino-terminal His tag for purification, and two + carboxyl-terminal Cys codons used for covalent attachment to the + gold-covered coverslips. + +They also give the full-length sequence of I27RS₈: + + Met-Arg-Gly-Ser-(His)₆-Gly-Ser-(I27-Arg-Ser)₇-I27-...-Cys-Cys + +They point out the Arg-Ser (RS) amino acid sequence is the BamHI-BglII +hybrid site, which makes sense (see below). + +Back on the Athena site, they have a [page describing their +procedure][I27O-syn] (they reference the Carrion-Vazquez paper). They +claim to use the restriction enzyme KpnI in addition to BamHI, BglII, +and SmaI. + +Carrion-Vazquez points to the following references: + +* [Kempe et al. 1985][k85] (CV16), the source of the multi-step cloning technique. +* [Rief et al.][r97] (CV10), for I27 subcloning. + +Rief +---- + +In their note 11, Rief et al. explain their synthesis procedure: + +* λ cDNA library +* Titin fragments of interest were amplified by PCR +* cloned into pET 9d +* NH₂-terminal domain boundaries were as in [Politou 1996]. +* The clones were fused with an NH₂-terminal His₆ tag and a + COOH-terminal Cys₂ tag for immobilization on solid surfaces. + +which doesn't help me very much. + +Kempe +----- + +The [Kempe article][k85] is more informative, focusing entirely on the +synthesis procedure (albeit for a different gene). Their figure 2 +outlines the general approach, and uses the following restriction +enzymes: PstI, BamHI, PstI, and BglII. I'll walk through their +procedure in detail below. + +Genetic code +------------ + +Wikipedia has a good page on the [genetic code][gcode] for converting +between DNA/mRNA codons and amino acids. I've written up a little +[[Python]] script, [[mRNAcode.py]], to automate the conversion of +various sequences, which helped me while I was writing this post. I'm +sure there are tons of similar programs out there, so don't feel +pressured to use mine ;). 
+ +Restriction enzymes +------------------- + +We'll use the following [restriction enzymes][renz] + +[BamHI][] + + 5' G|GATC C 3' + 3' C CTAG|G 5' + +[BglI][] (N is any nucleotide) + + 5' GCCN NNN|NGGC 3' + 3' CGGN|NNN NCCG 5' + +[BglII][] + + 5' A|GATC T 3' + 3' T CTAG|A 5' + +[HindIII][] + + 5' A|AGCT T 3' + 3' T TCGA|A 5' + +[KpnI][] + + 5' G GTAC|C 3' + 3' C|CATG G 5' + +[PstI][] + + 5' C TGCA|G 3' + 3' G|ACGT C 5' + +[SmaI][] + + 5' CCC|GGG 3' + 3' GGG|CCC 5' + +Details +======= + +Here's my attempt to reconstruct the details of the polymer-cloning +reactions, where they splice several copies of I27 into the expression +plasmid. + +Kempe procedure +--------------- + +Inserted their poly-SP into pHK414 (I haven't been able to find any +online sources for pHK414. Kempe cites [R.J. Watson et al. +*Expression of Herpes simplex virus type 1 and type 2 glyco-protein D +genes using the Escherichia coli lac promoter.* Y. Becker (Ed.), +*Recombinant DNA Research and Viruses*. Nijhoff, The Hague, 1985, +pp. 327-352.][w85]) + +### Synthetic SP + + HindIII. ,BamHI_. + | | Met Arg Pro Lys Pro Gln Gln Phe Phe Gly Leu Met | + 5’ GA AGC TTC ATG CGT CCG AAG CCG CAG CAG TTC TTC GGT CTC ATG GAT CCG + CT TCG AAG TAC GCA GGC TTC GGC GTC GTC AAG AAG CCA GAG TAC CTA GGC 5’ + +### pHK414 + + _______Linker_sequence______ + / \ + HindIII BamHI + ,PstI. BglII.| |,SmaI. | + CTGCAG...AGATCTAAGCTTCCCGGGGATCCAAGATCC + GACGTC...TCTAGATTCGAAGGGCCCCTAGGTTCTAGG + . . + ....................................... + +### Synthesizing pSP4-1 + +#### pHK414 + HindIII + BamHI + + HindIII BamHI. + (PstI) BglII,| | + CTGCAG...AGATCTA GATCCAAGATCC + GACGTC...TCTAGATTCGA GTTCTAGG + . . + ....................................... + +#### SP + HindIII + BamHI + + HindIII. ,BamHI_. + | | Met Arg Pro Lys Pro Gln Gln Phe Phe Gly Leu Met | + AGC TTC ATG CGT CCG AAG CCG CAG CAG TTC TTC GGT CTC ATG + AG TAC GCA GGC TTC GGC GTC GTC AAG AAG CCA GAG TAC CTA G + +#### pSP4-1 + + HindIII BamHI. + ,PstI. 
BglII.| | MetArgProLysProGlnGlnPhePheGlyLeuMet | + CTGCAG...AGATCTAAGCTTCATGCGTCCGAAGCCGCAGCAGTTCTTCGGTCTCATGGATCCAAGATCC + GACGTC...TCTAGATTCGAAGTACGCAGGCTTCGGCGTCGTCAAGAAGCCAGAGTACCTAGGTTCTAGG + . . + ...................................................................... + +Using `-SP-` to abbreviate the HindIII→Met→Met portion (less the +terminal G, which is part of the BamHI match sequence). + + ,PstI. BglII. BamHI. + CTGCAG...AGATCT-SP-GGATCC + GACGTC...TCTAGA-SP-CCTAGG + . . + ......................... + +### Synthesizing pSP4-2 + +pSP4-1 is split in two parallel reactions + +#### PstI + BamHI + + G...AGATCT-SP-G + ACGTC...TCTAGA-SP-CCTAGG + +#### PstI + BglII + + CTGCA GATCT-SP-GGATCC + G A-SP-CCTAGG + . . + ......................... + +#### pSP4-2 + +Then the SP-containing fragments (shown above) are isolated and mixed +together to form pSP4-2. + + ,PstI. BglII. other. BamHI. + CTGCAG...AGATCT-SP-GGATCT-SP-GGATCC + GACGTC...TCTAGA-SP-CCTAGA-SP-CCTAGG + . . + ................................... + +where the "other" sequence is the result of the BamHI/BglII splice. +Expanding the "-SP-" abbreviation around the SP joint: + + ....SP,other_.HindIII. SP..... + Leu Met Asp Leu Ser Phe Met Arg + CTC ATG GAT CTA AGC TTC ATG CGT + GAG TAC CTA GAT TCG AAG TAC GCA + +So the resulting poly-SP will have Asp-Leu-Ser-Phe linking amino +acids. + +By repeating the PstI + BamHI / PstI + BglII split-and-join, you can +synthesize plasmids with any number of SP repeats. + +I27RS₈ procedure +--------------------------- + +Like Kempe, Carrion-Vazquez et al. flank the I27 gene with BglII and +BamHI, but they reverse the order. Here's the output of their PCR: + + BamHI-I27-BglII-Cys-Cys-STOP-STOP + +From the PDB entry for I27 ([1TIT][]), the amino acid sequence is: + + ,leader_. 
+ MHHHHHHSSLIEVEKPLYGVEVFVGETAHFEIELSEPDVHGQWKLKGQPLTASPDCEIIEDGKKHILI + LHNCQLGMTGEVSFQAANAKSAANLKVKEL + +To translate this into cDNA, I've scanned through the sequence of +[NM_003319.4][], and found a close match from nucleotides 15991 +through 16248. + + 15982 CTAATAAAAG TGGAAAAGCC TCTGTACGGA GTAGAGGTGT TTGTTGGTGA + 16032 AACAGCCCAC TTTGAAATTG AACTTTCTGA ACCTGATGTT CACGGCCAGT + 16082 GGAAGCTGAA AGGACAGCCT TTGACAGCTT CCCCTGACTG TGAAATCATT + 16132 GAGGATGGAA AGAAGCATAT TCTGATCCTT CATAACTGTC AGCTGGGTAT + 16182 GACAGGAGAG GTTTCCTTCC AGGCTGCTAA TGCCAAATCT GCAGCCAATC + 16232 TGAAAGTGAA AGAATTG + +This cDNA match generates an amino acid sequence starting with LIKVEK instead +of the expected LIEVEK, but the LIKVEK version matches amino acids +12677-12765 in [Q8WZ42][] (canonical titin), and there is a natural +variant listed for [12679 K->E][var]. + +Interestingly, this sequence contains a PstI site at nucleotides 16220 +through 16225. None of our other restriction enzymes have sites in +the I27 sequence. + +Carrion-Vazquez et al. list two vectors in their procedure, but I'm +not sure about their respective roles. + +### pQE30 + +[pQE30][pQE30-a] ([sequence][pQE30-b]) is listed as the "expression +vector", but I'm not sure why they would need a non-expression vector, +as they don't reference cross-vector subcloning after inserting their +I27 monomer into the plasmid. + +From the [Qiagen site][pQE30-a], the section around the linker +nucleotides 115 through 203 is: + + ,RGS-His epitope__________________. ,BamHI. + Met Arg Gly Ser His His His His His His Gly Ser Ala Cys Glu Leu ... + ATG AGA GGA TCG CAT CAC CAT CAC CAT CAC GGA TCC GCA TGC GAG CTC ... + TAC TCT CCT AGC GTA GTG GTA GTG GTA GTG CCT AGG CGT ACG CTC GAG ... + + ,SmaI__. + ,KpnI_. HindIII + Gly Thr Pro Gly Arg Pro Ala Ala Lys Leu Asn STOP + GGT ACC CCG GGT CGA CCT GCA GCC AAG CTT AAT TAG CTG AG + CCA TGG GGC CCA GCT GGA CGT CGG TTC GAA TTA ATC GAC TC + +However, there is no BglII site in this linker. 
In fact, there is no +BglII site in the entire pQE30 plasmid, so they'd need to use a third +restriction enzyme to insert their I27 (which does contain a trailing +BglII). + +### pUC19 + +From [BCCM/LMBP][pUC19-a] and [GenBank][pUC19-b], the section around +the linker nucleotides 233 through 289 is: + + ,SmaI_. + HindIII. ,PstI__. ,BamHI_. ,KpnI__. + Met STOP + AA GCT TGC ATG CCT GCA GGT CGA CTC TAG AGG ATC CCC GGG TAC CGA + + GCT CGA ATT C + +However, there is no BglII site in the entire pUC19 plasmid either, so they'd +need to use a third restriction enzyme to insert their I27. + +### Questions + +1. Why do Carrion-Vazquez et al. list two different plasmids? +2. What is the 3'-side restriction enzyme that Carrion-Vazquez et + al. use to insert their I27 into their plasmid? +3. What is the remote restriction enzyme that Carrion-Vazquez et + al. use to break their opened plasmids (Kempe PstI equivalent)? +4. The BamHI and SmaI sites in pUC19 overlap, so it is unclear how you + could use both to "linearize" pUC19. It would seem that either one + would open the plasmid on its own, although I'm not sure you could + "heal" the blunt-ended SmaI cut. +5. Since the Arg-Ser joint is formed by a BglII/BamHI overlap, why are + there no BglII-coded amino acids after the last I27 in the I27RS₈ + sequence? If there are, why do Carrion-Vazquez et al. not + acknowledge them when they write [3]: + + The full-length construct, I27RS₈, results in the following + amino acid additions: (i) the amino-terminal sequence is + Met-Arg-Gly-Ser-(His)6-Gly-Ser-I27 codons; (ii) the junction + between the domains (BamHI-BglII hybrid site) is Arg-Ser; and + (iii) the protein terminates in Cys-Cys. + + Since they don't acknowledge an I27-Arg-Ser-Cys-Cys ending, might + there be more amino acids in the C terminal addition? 
+ +### Working backward + +Since I'm stuck trying to get I27 into either plasmid, let's try and +work backward from + + Met-Arg-Gly-Ser-(His)₆-Gly-Ser-(I27-Arg-Ser)₇-I27-Cys-Cys + +The BglII/BamHI overlap would produce the expected Arg-Ser joint. + + BglII BamHI + A + GATCC = AGATCC = Arg-Ser + TCTAG G TCTAGG + +#### Final plasmid (pI27-8) + +The beginning of this sequence looks like the start of pQE30's linker, +so we'll assume the final plasmid was: + + Met-Arg-Gly-Ser-(His)₆-Gly-Ser-(I27-Arg-Ser)₇-I27-Arg-Ser-Cys-Cys + + remote ... ,RGS-His epitope__________________. ,BamHI. I27... + ... Met Arg Gly Ser His His His His His His Gly Ser Leu Ile ... + ??? ... ATG AGA GGA TCG CAT CAC CAT CAC CAT CAC GGA TCC CTA ATA ... + ??? ... TAC TCT CCT AGC GTA GTG GTA GTG GTA GTG CCT AGG GAT TAT ... + + ........I27 joint_. I27 ... final I27 ,BglII. continuation of pQE30? + ... Glu Leu Leu ... Leu Arg Ser Cys Cys STOPSTOP... + ... GAA TTG AGA TCC CTA ... TTG AGA TCT TGC TGC TAG TAG ... + ... CTT AAC TCT AGG GAT ... AAC TCT AGA ACG ACG ATC ATC ... + +#### Penultimate plasmid (pI27-4) + + remote ... ,RGS-His epitope__________________. ,BamHI. I27... + Met Arg Gly Ser His His His His His His Gly Ser Leu Ile ... + ??? ... ATG AGA GGA TCG CAT CAC CAT CAC CAT CAC GGA TCC CTA ATA ... + ??? ... TAC TCT CCT AGC GTA GTG GTA GTG GTA GTG CCT AGG GAT TAT ... + + ... I27 joint_. I27 ... fourth I27 ,BglII. continuation of pQE30? + ... Glu Leu Leu ... Leu Arg Ser Cys Cys STOPSTOP... + ... GAA TTG AGA TCC CTA ... TTG AGA TCT TGC TGC TAG TAG ... + ... CTT AAC TCT AGG GAT ... AAC TCT AGA ACG ACG ATC ATC ... + +##### pI27-4 + BamHI + remote + + remote ,BamHI. I27... + Leu Ile ... + ? GA TCC CTA ATA ... + ?? G GAT TAT ... + + ....... I27 joint_. I27 ... fourth I27 ,BglII. continuation of pQE30? + ... Glu Leu Leu ... Leu Arg Ser Cys Cys STOPSTOP... + ... GAA TTG AGA TCC CTA ... TTG AGA TCT TGC TGC TAG TAG ... + ... CTT AAC TCT AGG GAT ... AAC TCT AGA ACG ACG ATC ATC ... 
+ +##### pI27-4 + BglII + remote + + remote ... ,RGS-His epitope__________________. ,BamHI. I27... + Met Arg Gly Ser His His His His His His Gly Ser Leu Ile ... + ?? ... ATG AGA GGA TCG CAT CAC CAT CAC CAT CAC GGA TCC CTA ATA ... + ? ... TAC TCT CCT AGC GTA GTG GTA GTG GTA GTG CCT AGG GAT TAT ... + + ....... I27 joint_. I27 ... fourth I27 ,BglII. + ... Glu Leu Leu ... Leu + ... GAA TTG AGA TCC CTA ... TTG A + ... CTT AAC TCT AGG GAT ... AAC TCT AG + +##### pI27-8 + + remote ... ,RGS-His epitope__________________. ,BamHI. I27... + Met Arg Gly Ser His His His His His His Gly Ser Leu Ile ... + ??? ... ATG AGA GGA TCG CAT CAC CAT CAC CAT CAC GGA TCC CTA ATA ... + ??? ... TAC TCT CCT AGC GTA GTG GTA GTG GTA GTG CCT AGG GAT TAT ... + + ....... I27 joint_. I27 ... fourth I27 ,other. I27... + ... Glu Leu Leu ... Leu Arg Ser Leu Ile ... + ... GAA TTG AGA TCC CTA ... TTG AGA TCC CTA ATA ... + ... CTT AAC TCT AGG GAT ... AAC TCT AGG GAT TAT ... + + ....... I27 joint_. I27 ... fourth I27 ,BglII. continuation of pQE30? + ... Glu Leu Leu ... Leu Arg Ser Cys Cys STOPSTOP... + ... GAA TTG AGA TCC CTA ... TTG AGA TCT TGC TGC TAG TAG ... + ... CTT AAC TCT AGG GAT ... AAC TCT AGA ACG ACG ATC ATC ... + +#### Continuing to the first plasmid, pI27-1 must have been + + remote ... ,RGS-His epitope__________________. ,BamHI. I27... + ... Met Arg Gly Ser His His His His His His Gly Ser Leu Ile ... + ??? ... ATG AGA GGA TCG CAT CAC CAT CAC CAT CAC GGA TCC CTA ATA ... + ??? ... TAC TCT CCT AGC GTA GTG GTA GTG GTA GTG CCT AGG GAT TAT ... + + ........I27 ,BglII. continuation of pQE30? + ... Glu Leu Arg Ser Cys Cys STOPSTOP... + ... GAA TTG AGA TCT TGC TGC TAG TAG ... + ... CTT AAC TCT AGA ACG ACG ATC ATC ... 
+ +### Potential pQE30 insertion points + +* KpnI (present after BamHI in both plasmids) + +### Potential remote restriction enzymes + +* BglI (pQE30 nucleotides 2583-2593 (GCCGGAAGGGC), Amp-resistance + 3256-2396; pUC19 has two BglI sites (bad idea)) + + +[cv99]: http://dx.doi.org/10.1073/pnas.96.7.3694 +[AthenaES]: http://www.athenaes.com/ +[I27O]: http://www.athenaes.com/I27OAFMReferenceProtein.php +[I27O-tb]: http://www.athenaes.com/tech_brief_I27O_protein.php +[I27O-syn]: http://www.athenaes.com/Projects_Polyproteins.php +[k85]: http://dx.doi.org/10.1016/0378-1119(85)90318-X +[gcode]: http://en.wikipedia.org/wiki/Genetic_code +[renz]: http://en.wikipedia.org/wiki/Restriction_enzyme +[BamHI]: http://en.wikipedia.org/wiki/BamHI +[BglI]: http://en.wikipedia.org/wiki/List_of_restriction_enzyme_cutting_sites:_Bd-Bp#Bd_-_Bp +[BglII]: http://en.wikipedia.org/wiki/BglII +[HindIII]: http://en.wikipedia.org/wiki/HindIII +[KpnI]: http://en.wikipedia.org/wiki/List_of_restriction_enzyme_cutting_sites:_G-K#K +[PstI]: http://en.wikipedia.org/wiki/PstI +[SmaI]: http://en.wikipedia.org/wiki/List_of_restriction_enzyme_cutting_sites:_S#S +[w85]: http://books.google.com/books?id=eA6iSmR0I4wC +[1TIT]: http://www.pdb.org/pdb/explore/explore.do?structureId=1TIT +[NM_003319.4]: http://www.ncbi.nlm.nih.gov/nuccore/NM_003319 +[Q8WZ42]: http://www.uniprot.org/blast/?about=Q8WZ42[12677-12765] +[var]: http://web.expasy.org/cgi-bin/variant_pages/get-sprot-variant.pl?VAR_040140 +[pQE30-a]: http://www.qiagen.com/literature/vectors_pqe.aspx +[pQE30-b]: http://www.qiagen.com/literature/pqesequences/pqe-30w.txt +[pUC19-a]: http://bccm.belspo.be/db/lmbp_plasmid_details.php?NM=pUC19 +[pUC19-b]: http://www.ncbi.nlm.nih.gov/nucleotide/M77789?report=genbank + +[[!tag tags/theory]] diff --git a/posts/I27-synthesis/mRNAcode.py b/posts/I27-synthesis/mRNAcode.py new file mode 100755 index 0000000..c3fd51b --- /dev/null +++ b/posts/I27-synthesis/mRNAcode.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python +# +# 
# Copyright (C) 2011 W. Trevor King
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program.  If not, see
# <http://www.gnu.org/licenses/>.

"""Translate DNA/mRNA nucleotide sequences into amino acid sequences.

The module-level tables map full names to abbreviations
(`AMINO_ACIDS`, `NUCLEOTIDES`), nucleotides to their base-pairing
partners (`NUCLEOTIDE_COMPLEMENT`), and mRNA codons to amino acids
(`CODE`).  Run the module as a script for a small command line
interface (see `--help`).
"""

import argparse
import itertools


# Full name -> [three-letter abbreviation, one-letter abbreviation].
AMINO_ACIDS = {
    'Alanine': ['Ala', 'A'],
    'Arginine': ['Arg', 'R'],
    'Asparagine': ['Asn', 'N'],
    'Aspartic acid': ['Asp', 'D'],
    'Cysteine': ['Cys', 'C'],
    'Glutamic acid': ['Glu', 'E'],
    'Glutamine': ['Gln', 'Q'],
    'Glycine': ['Gly', 'G'],
    'Histidine': ['His', 'H'],
    'Isoleucine': ['Ile', 'I'],
    'Leucine': ['Leu', 'L'],
    'Lysine': ['Lys', 'K'],
    'Methionine': ['Met', 'M'],
    'Phenylalanine': ['Phe', 'F'],
    'Proline': ['Pro', 'P'],
    'Serine': ['Ser', 'S'],
    'Threonine': ['Thr', 'T'],
    'Tryptophan': ['Trp', 'W'],
    'Tyrosine': ['Tyr', 'Y'],
    'Valine': ['Val', 'V'],
}

# Full name -> [one-letter abbreviation].
NUCLEOTIDES = {
    'Adenine': ['A'],
    'Cytosine': ['C'],
    'Guanine': ['G'],
    'Thymine': ['T'],
    'Uracil': ['U'],
}

# Watson-Crick partner of each nucleotide on a complementary *DNA*
# strand: A pairs with T, C pairs with G, and (RNA) U pairs with A.
NUCLEOTIDE_COMPLEMENT = {
    'Adenine': 'Thymine',
    'Cytosine': 'Guanine',
    'Guanine': 'Cytosine',
    'Thymine': 'Adenine',
    'Uracil': 'Adenine',
}

# Standard genetic code: mRNA codon (5' to 3') -> amino acid name, or a
# 'STOP (...)' marker that is deliberately absent from AMINO_ACIDS.
CODE = {
    ('Adenine', 'Adenine', 'Adenine'): 'Lysine',
    ('Adenine', 'Adenine', 'Cytosine'): 'Asparagine',
    ('Adenine', 'Adenine', 'Guanine'): 'Lysine',
    ('Adenine', 'Adenine', 'Uracil'): 'Asparagine',
    ('Adenine', 'Cytosine', 'Adenine'): 'Threonine',
    ('Adenine', 'Cytosine', 'Cytosine'): 'Threonine',
    ('Adenine', 'Cytosine', 'Guanine'): 'Threonine',
    ('Adenine', 'Cytosine', 'Uracil'): 'Threonine',
    ('Adenine', 'Guanine', 'Adenine'): 'Arginine',
    ('Adenine', 'Guanine', 'Cytosine'): 'Serine',
    ('Adenine', 'Guanine', 'Guanine'): 'Arginine',
    ('Adenine', 'Guanine', 'Uracil'): 'Serine',
    ('Adenine', 'Uracil', 'Adenine'): 'Isoleucine',
    ('Adenine', 'Uracil', 'Cytosine'): 'Isoleucine',
    ('Adenine', 'Uracil', 'Guanine'): 'Methionine',
    ('Adenine', 'Uracil', 'Uracil'): 'Isoleucine',
    ('Cytosine', 'Adenine', 'Adenine'): 'Glutamine',
    ('Cytosine', 'Adenine', 'Cytosine'): 'Histidine',
    ('Cytosine', 'Adenine', 'Guanine'): 'Glutamine',
    ('Cytosine', 'Adenine', 'Uracil'): 'Histidine',
    ('Cytosine', 'Cytosine', 'Adenine'): 'Proline',
    ('Cytosine', 'Cytosine', 'Cytosine'): 'Proline',
    ('Cytosine', 'Cytosine', 'Guanine'): 'Proline',
    ('Cytosine', 'Cytosine', 'Uracil'): 'Proline',
    ('Cytosine', 'Guanine', 'Adenine'): 'Arginine',
    ('Cytosine', 'Guanine', 'Cytosine'): 'Arginine',
    ('Cytosine', 'Guanine', 'Guanine'): 'Arginine',
    ('Cytosine', 'Guanine', 'Uracil'): 'Arginine',
    ('Cytosine', 'Uracil', 'Adenine'): 'Leucine',
    ('Cytosine', 'Uracil', 'Cytosine'): 'Leucine',
    ('Cytosine', 'Uracil', 'Guanine'): 'Leucine',
    ('Cytosine', 'Uracil', 'Uracil'): 'Leucine',
    ('Guanine', 'Adenine', 'Adenine'): 'Glutamic acid',
    ('Guanine', 'Adenine', 'Cytosine'): 'Aspartic acid',
    ('Guanine', 'Adenine', 'Guanine'): 'Glutamic acid',
    ('Guanine', 'Adenine', 'Uracil'): 'Aspartic acid',
    ('Guanine', 'Cytosine', 'Adenine'): 'Alanine',
    ('Guanine', 'Cytosine', 'Cytosine'): 'Alanine',
    ('Guanine', 'Cytosine', 'Guanine'): 'Alanine',
    ('Guanine', 'Cytosine', 'Uracil'): 'Alanine',
    ('Guanine', 'Guanine', 'Adenine'): 'Glycine',
    ('Guanine', 'Guanine', 'Cytosine'): 'Glycine',
    ('Guanine', 'Guanine', 'Guanine'): 'Glycine',
    ('Guanine', 'Guanine', 'Uracil'): 'Glycine',
    ('Guanine', 'Uracil', 'Adenine'): 'Valine',
    ('Guanine', 'Uracil', 'Cytosine'): 'Valine',
    ('Guanine', 'Uracil', 'Guanine'): 'Valine',
    ('Guanine', 'Uracil', 'Uracil'): 'Valine',
    ('Uracil', 'Adenine', 'Adenine'): 'STOP (Ochre)',
    ('Uracil', 'Adenine', 'Cytosine'): 'Tyrosine',
    ('Uracil', 'Adenine', 'Guanine'): 'STOP (Amber)',
    ('Uracil', 'Adenine', 'Uracil'): 'Tyrosine',
    ('Uracil', 'Cytosine', 'Adenine'): 'Serine',
    ('Uracil', 'Cytosine', 'Cytosine'): 'Serine',
    ('Uracil', 'Cytosine', 'Guanine'): 'Serine',
    ('Uracil', 'Cytosine', 'Uracil'): 'Serine',
    ('Uracil', 'Guanine', 'Adenine'): 'STOP (Opal)',
    ('Uracil', 'Guanine', 'Cytosine'): 'Cysteine',
    ('Uracil', 'Guanine', 'Guanine'): 'Tryptophan',
    ('Uracil', 'Guanine', 'Uracil'): 'Cysteine',
    ('Uracil', 'Uracil', 'Adenine'): 'Leucine',
    ('Uracil', 'Uracil', 'Cytosine'): 'Phenylalanine',
    ('Uracil', 'Uracil', 'Guanine'): 'Leucine',
    ('Uracil', 'Uracil', 'Uracil'): 'Phenylalanine',
}


# Cache of inverted abbreviation tables, keyed by id() because dicts
# are unhashable.  Each entry also holds the source dict so that it
# stays alive and its id() cannot be recycled by an unrelated dict.
_INVERSE_DICTS = {}


def unabbreviate(abbreviations, abbreviation):
    """Return the full name for `abbreviation`.

    `abbreviations` maps full names to lists of abbreviations (e.g.
    `AMINO_ACIDS` or `NUCLEOTIDES`).  The inverted table is built on
    first use and cached, so `abbreviations` should not be mutated
    afterwards.  Raises `KeyError` for an unknown abbreviation.

    >>> unabbreviate(AMINO_ACIDS, 'Ala')
    'Alanine'
    >>> unabbreviate(AMINO_ACIDS, 'A')
    'Alanine'
    >>> unabbreviate(NUCLEOTIDES, 'A')
    'Adenine'
    """
    key = id(abbreviations)
    try:
        _owner, inverse = _INVERSE_DICTS[key]
    except KeyError:
        inverse = {}
        for name, abbrevs in abbreviations.items():
            for abbrev in abbrevs:
                inverse[abbrev] = name
        _INVERSE_DICTS[key] = (abbreviations, inverse)
    return inverse[abbreviation]


def decode_sequence(abbreviations, sequence):
    """Yield the full name for each one-letter code in `sequence`.

    Whitespace is skipped and letters are upper-cased before lookup.
    Raises `KeyError` (on iteration) for an unknown letter.

    >>> list(decode_sequence(NUCLEOTIDES, 'ACG TU'))
    ['Adenine', 'Cytosine', 'Guanine', 'Thymine', 'Uracil']
    """
    for x in sequence:
        if x.isspace():
            continue
        yield unabbreviate(abbreviations, x.upper())


def transcribe_to_mRNA(nucleotides):
    """Yield `nucleotides` with every Thymine replaced by Uracil.

    >>> list(transcribe_to_mRNA(['Adenine', 'Cytosine', 'Guanine', 'Thymine']))
    ['Adenine', 'Cytosine', 'Guanine', 'Uracil']
    """
    for n in nucleotides:
        if n == 'Thymine':
            yield 'Uracil'
        else:
            yield n


def split_into_codons(nucleotides, length=3):
    """Yield successive `length`-tuples of `nucleotides`.

    A final, shorter tuple is yielded if the input does not divide
    evenly.

    >>> sequence = 'AGC TTC ATG CGT CCG AAG CC'
    >>> nucleotides = decode_sequence(NUCLEOTIDES, sequence)
    >>> codons = split_into_codons(nucleotides)
    >>> print('\\n'.join(str(c) for c in codons))
    ('Adenine', 'Guanine', 'Cytosine')
    ('Thymine', 'Thymine', 'Cytosine')
    ('Adenine', 'Thymine', 'Guanine')
    ('Cytosine', 'Guanine', 'Thymine')
    ('Cytosine', 'Cytosine', 'Guanine')
    ('Adenine', 'Adenine', 'Guanine')
    ('Cytosine', 'Cytosine')
    """
    codon = []
    for n in nucleotides:
        codon.append(n)
        if len(codon) == length:
            yield tuple(codon)
            codon = []
    if codon:
        yield tuple(codon)


def translate_to_amino_acids(nucleotides):
    """Yield the amino acid (or STOP marker) for each codon.

    The input sequence should be mRNA nucleotides read from 5' to 3'.
    An incomplete codon at the end of the sequence cannot be
    translated and is dropped.  Thymine is not valid mRNA and raises
    `KeyError`; use `transcribe_to_mRNA` first.

    >>> sequence = 'AUG AGC UUC AUG CGU CCG AAG'
    >>> nucleotides = decode_sequence(NUCLEOTIDES, sequence)
    >>> amino_acids = translate_to_amino_acids(nucleotides)
    >>> print('\\n'.join(amino_acids))
    Methionine
    Serine
    Phenylalanine
    Methionine
    Arginine
    Proline
    Lysine

    The leading Methionine is also the "start codon".  There are
    other possible start codon sequences (e.g. GUG) used in
    prokaryotes such as E. coli.
    """
    for codon in split_into_codons(nucleotides):
        if len(codon) < 3:
            # Only the last chunk can be short; nothing follows it.
            break
        yield CODE[codon]


def complement_sequence(sequence):
    """Return the complementary DNA strand for a string of letters.

    The result is position-aligned with the input (i.e. it reads 3' to
    5' when the input reads 5' to 3'); it is not reversed.  Characters
    that are not nucleotide letters (spaces, `|`, `.`, ...) are passed
    through unchanged so formatted sequences keep their layout.

    >>> complement_sequence('GGA TCC')
    'CCT AGG'
    """
    out = []
    for letter in sequence:
        try:
            nucleotide = unabbreviate(NUCLEOTIDES, letter.upper())
        except KeyError:
            out.append(letter)
        else:
            out.append(NUCLEOTIDES[NUCLEOTIDE_COMPLEMENT[nucleotide]][-1])
    return ''.join(out)


def find_matches(amino_acids, match):
    """Yield each index in `amino_acids` where the list `match` occurs.

    Only the reading frame already used to build `amino_acids` is
    searched.  An empty `match` yields nothing.

    >>> list(find_matches(['Serine', 'Cysteine'], ['Cysteine']))
    [1]
    """
    width = len(match)
    if not width:
        return
    # +1 so a match ending on the final residue is also found.
    for start in range(len(amino_acids) - width + 1):
        if amino_acids[start:start + width] == match:
            yield start


def format_amino_acids(amino_acids, short=False):
    """Return `amino_acids` as an abbreviated string.

    Three-letter codes joined by '-' by default, or concatenated
    one-letter codes if `short`.  Names missing from `AMINO_ACIDS`
    (the STOP markers) are shown as '!'.

    >>> format_amino_acids(['Methionine', 'STOP (Amber)'])
    'Met-!'
    """
    if short:
        sep, index = '', -1
    else:
        sep, index = '-', 0
    return sep.join(AMINO_ACIDS.get(aa, ['!'])[index] for aa in amino_acids)


def _print_tables():
    """Print the codon -> amino acid and amino acid -> codon tables."""
    print('RNA Amino acid')
    print('=== ==========')
    order = ('Uracil', 'Cytosine', 'Adenine', 'Guanine')
    for codon in itertools.product(order, repeat=3):
        letters = ''.join(NUCLEOTIDES[n][-1] for n in codon)
        name = CODE[codon]
        if name in AMINO_ACIDS:
            name = ' '.join(AMINO_ACIDS[name])
        print('{} {}'.format(letters, name))
    print('')
    print('AA    RNA')
    print('===== ===')
    for name in sorted(AMINO_ACIDS):
        abbrevs = ' '.join(AMINO_ACIDS[name])
        codons = sorted(''.join(NUCLEOTIDES[n][-1] for n in codon)
                        for codon, aa in CODE.items() if aa == name)
        print('{} {}'.format(abbrevs, ' '.join(codons)))


def main(argv=None):
    """Run the command line interface (`argv` defaults to sys.argv[1:])."""
    p = argparse.ArgumentParser(
        description='Translate DNA/mRNA to an amino acid sequence.')
    p.add_argument(
        'sequence', metavar='ACGTU', nargs='*',
        help="Genetic sequence to translate (5' to 3')")
    p.add_argument(
        '-s', '--short', action='store_true', default=False,
        help='Use single-letter amino acid abbreviations')
    p.add_argument(
        '-m', '--match',
        help='Match a protein sequence in the source mRNA')
    p.add_argument(
        '-c', '--complement', action='store_true', default=False,
        help='Print the complement DNA and exit')
    p.add_argument(
        '--count', action='store_true', default=False,
        help='Print amino acid, nucleotide, and codon counts and exit')
    p.add_argument(
        '--table', action='store_true', default=False,
        help='Print translation tables and exit')

    args = p.parse_args(argv)

    if args.count:
        print('amino acids: {:d}'.format(len(AMINO_ACIDS)))
        print('nucleotides: {:d}'.format(len(NUCLEOTIDES)))
        print('codons: {:d}'.format(len(CODE)))
        return
    if args.table:
        _print_tables()
        return

    sequence = ' '.join(args.sequence)
    if args.complement:
        print(complement_sequence(sequence))
        return

    nucleotides = decode_sequence(NUCLEOTIDES, sequence)
    nucleotides = list(transcribe_to_mRNA(nucleotides))  # no-op on mRNA
    amino_acids = list(translate_to_amino_acids(nucleotides))

    if args.match:
        match = list(decode_sequence(AMINO_ACIDS, args.match))
        for start in find_matches(amino_acids, match):
            start_n = start * 3
            stop_n = start_n + 3 * len(match)
            print('matched nucleotides {:d} through {:d}'.format(
                start_n, stop_n - 1))
            print(''.join(NUCLEOTIDES[n][-1]
                          for n in nucleotides[start_n:stop_n]))
    else:
        print(format_amino_acids(amino_acids, short=args.short))


if __name__ == '__main__':
    main()