ARB
Translate.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : Translate.cxx //
4 // Purpose : Nucleotide->AA translation //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in June 2006 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "Translate.hxx"
13 
14 #include <AP_pro_a_nucs.hxx>
15 #include <AP_codon_table.hxx>
16 #include <arbdbt.h>
17 #include <arb_global_defs.h>
18 #include <cctype>
19 
20 #define tl_assert(cond) arb_assert(cond)
21 
22 GB_ERROR translate_saveInfo(GBDATA *gb_species, int arb_transl_table, int codon_start) {
23  int embl_transl_table = TTIT_arb2embl(arb_transl_table);
24 
25  tl_assert(codon_start >= 0 && codon_start<3); // codon_start has to be 0..2
26  tl_assert(embl_transl_table >= 0);
27 
28  GB_ERROR error = GBT_write_string(gb_species, "transl_table", GBS_global_string("%i", embl_transl_table));
29  if (!error) error = GBT_write_string(gb_species, "codon_start", GBS_global_string("%i", codon_start+1));
30 
31  return error;
32 }
33 
36 
37  GBDATA *gb_transl_table = GB_entry(gb_species, "transl_table");
38  if (gb_transl_table) error = GB_delete(gb_transl_table);
39 
40  if (!error) {
41  GBDATA *gb_codon_start = GB_entry(gb_species, "codon_start");
42  if (gb_codon_start) error = GB_delete(gb_codon_start);
43  }
44 
45  return error;
46 }
47 
48 GB_ERROR translate_getInfo(GBDATA *gb_item, int& arb_transl_table, int& codon_start) {
49  // looks for sub-entries 'transl_table' and 'codon_start' of species (works for genes as well)
50  // if found -> test for validity and translate 'transl_table' from EMBL to ARB table number
51  //
52  // returns: an error in case of problems
53  //
54  // 'arb_transl_table' is set to -1 if not found, otherwise it contains the arb table number
55  // 'codon_start' is set to -1 if not found, otherwise it contains the codon_start (0..2)
56 
57  arb_transl_table = -1; // not found yet
58  codon_start = -1; // not found yet
59 
61  GBDATA *gb_transl_table = GB_entry(gb_item, "transl_table");
62 
63  if (gb_transl_table) {
64  int embl_table = atoi(GB_read_char_pntr(gb_transl_table));
65  arb_transl_table = TTIT_embl2arb(embl_table);
66  if (arb_transl_table == -1) { // ill. table
67  error = GBS_global_string("Illegal (or unsupported) value (%i) in 'transl_table'", embl_table);
68  }
69  }
70 
71  if (!error) {
72  GBDATA *gb_codon_start = GB_entry(gb_item, "codon_start");
73  if (gb_codon_start) {
74  int codon_start_value = atoi(GB_read_char_pntr(gb_codon_start));
75 
76  if (codon_start_value<1 || codon_start_value>3) {
77  error = GBS_global_string("Illegal value (%i) in 'codon_start' (allowed: 1..3)", codon_start_value);
78  }
79  else {
80  codon_start = codon_start_value-1; // internal value is 0..2
81  }
82  }
83  else if (arb_transl_table != -1) {
84  // default to codon_start 1
85  error = GBT_write_string(gb_item, "codon_start", "1");
86  if (!error) codon_start = 0; // internal value is 0..2
87  }
88  }
89 
90  if (!error && arb_transl_table != codon_start) {
91  if (arb_transl_table == -1) error = "Found 'codon_start', but 'transl_table' is missing";
92  else if (codon_start == -1) error = "Found 'transl_table', but 'codon_start' is missing";
93  }
94 
95  if (error) { // append species name to error message
96  error = GBS_global_string("%s (item='%s')", error, GBT_get_name_or_description(gb_item));
97  }
98 
99  return error;
100 }
101 
inline void memcpy3(char *dest, const char *source) {
    // copies exactly 3 characters (one codon) from 'source' to 'dest'
    // (no terminating zero is written)
    for (int k = 0; k<3; ++k) {
        dest[k] = source[k];
    }
}
107 
108 int translate_nuc2aa(int arb_code_nr, char *data, size_t size, size_t pos, bool translate_all, bool create_start_codon, bool append_stop_codon, int *translatedSize) {
109  // if translate_all == true -> 'pos' > 1 produces a leading 'X' in protein data
110  // (otherwise nucleotides in front of the starting pos are simply ignored)
111  //
112  // if 'create_start_codon' is true and the first generated codon is a start codon of the used
113  // code, a 'M' is inserted instead of the codon
114  // if 'append_stop_codon' is true, the stop codon is appended as '*'. This is only done, if the last
115  // character not already is a stop codon. (Note: provide data with correct size)
116  //
117  // returns:
118  // - the translated protein sequence in 'data'
119  // - the length of the translated protein sequence in 'translatedSize' (if != 0)
120  // - number of stop-codons in translated sequence as result (incl. optional or appended stop-codons)
121 
122  bool create_stop_codon = true; // could be a param; acts similar to 'create_start_codon', but for optional stop codons
123 
124  arb_assert(pos <= 2);
125 
126  for (char *p = data; *p; p++) {
127  char c = *p;
128  if ((c>='a') && (c<='z')) c = c+'A'-'a';
129  if (c=='U') c = 'T';
130  *p = c;
131  }
132 
133  char codonBuf[4];
134  codonBuf[3] = 0;
135 
136  char *dest = data;
137 
138  if (pos && translate_all) {
139  for (char *p = data; p<data+pos; ++p) {
140  char c = *p;
141  if (!GAP::is_std_gap(c)) { // found a nucleotide
142  *dest++ = 'X';
143  break;
144  }
145  }
146  }
147 
148  int stops = 0;
149  size_t i = pos;
150  char startCodon = 0;
151 
152  AWT_translator translator(arb_code_nr);
153 
154  if (create_start_codon) {
155  memcpy3(codonBuf, data+pos);
156  startCodon = translator.isStartCodon(codonBuf);
157  }
158 
159  for (char *p = data+pos; i+2<size; p+=3, i+=3) {
160  memcpy3(codonBuf, p);
161 
162  char aa = translator.codon2aa(codonBuf);
163  if (aa == '*') ++stops;
164  arb_assert(!islower(aa));
165 
166  *(dest++) = aa;
167  }
168 
169  if (dest>data) { // at least 1 amino written
170  if (create_start_codon && startCodon) {
171  arb_assert(startCodon == 'M');
172  data[0] = startCodon;
173  }
174 
175  bool last_is_stop = dest[-1] == '*';
176  if (!last_is_stop) {
177  if (create_stop_codon && translator.isStopCodon(codonBuf)) { // correct optional stop-codon
178  arb_assert(translator.CodeNr()>=20); // appears first in table 20 (=EMBL 27)
179  dest[-1] = '*'; // use it (we are at EOS)
180  ++stops;
181  }
182  else if (append_stop_codon) {
183  *dest++ = '*';
184  ++stops;
185  }
186  }
187  }
188  dest[0] = 0;
189 
190  if (translatedSize) *translatedSize = dest-data;
191 
192  return stops;
193 }
194 
195 // --------------------------------------------------------------------------------
196 
197 #ifdef UNIT_TESTS
198 #ifndef TEST_UNIT_H
199 #include <test_unit.h>
200 #endif
201 
202 static arb_test::match_expectation translates_into(int arb_code_nr, const char *dna, const char *exp_transl, int exp_stops, int exp_size) {
203  using namespace arb_test;
204 
205  size_t dna_len = strlen(dna);
206  char *data = ARB_strduplen(dna, dna_len);
207 
208  int size;
209  int stops = translate_nuc2aa(arb_code_nr, data, dna_len, 0, false, true, true, &size);
210  // test all 3 reading frames?
211 
212  const char *translated = data;
213 
214  expectation_group expected(that(translated).is_equal_to(exp_transl));
215  expected.add(that(stops).is_equal_to(exp_stops));
216  expected.add(that(size).is_equal_to(exp_size));
217 
218  free(data);
219 
220  return all().ofgroup(expected);
221 }
222 
223 #define TEST_TRANSLATION(nr,dna,aa,stp,siz) TEST_EXPECTATION(translates_into(nr,dna,aa,stp,siz))
224 #define TEST_TRANSLATION__WANTED(nr,dna,aa,stp,siz) TEST_EXPECTATION__WANTED(translates_into(nr,dna,aa,stp,siz))
225 #define e2a(ec) TTIT_embl2arb(ec)
226 
227 void TEST_translate() {
228  TEST_TRANSLATION(e2a(1), "TTYTCN", "FS*", 1, 3); // stop-codon appended (dna does not end with stop)
229 
230  // test optional start-codons:
231  TEST_TRANSLATION(e2a(2), "ATCATCTTTTAR", "MIF*", 1, 4); // only std nucs
232  TEST_TRANSLATION(e2a(2), "ATYATYTTTTAR", "MIF*", 1, 4); // containing IUPAC-nucs
233  TEST_TRANSLATION(e2a(2), "ATAATATARTAR", "MM**", 2, 4); // ATA->M (always, i.e. non-optional)
234 
235  // test optional stop-codons:
236  TEST_TRANSLATION(e2a(27), "TGATGA", "W*", 1, 2); // only std nucs
237  TEST_TRANSLATION(e2a(28), "TGATGA", "W*", 1, 2);
238  TEST_TRANSLATION(e2a(28), "TAGTAG", "Q*", 1, 2);
239  TEST_TRANSLATION(e2a(28), "TAATAA", "Q*", 1, 2);
240  TEST_TRANSLATION(e2a(28), "TARTAR", "Q*", 1, 2); // containing IUPAC-nucs
241 
242  TEST_TRANSLATION(e2a(31), "TAGTAG", "E*", 1, 2); // only std nucs
243  TEST_TRANSLATION(e2a(31), "TAATAA", "E*", 1, 2);
244  TEST_TRANSLATION(e2a(31), "TARTAR", "E*", 1, 2); // containing IUPAC-nucs
245 }
246 
247 #endif // UNIT_TESTS
248 
249 // --------------------------------------------------------------------------------
#define arb_assert(cond)
Definition: arb_assert.h:245
const char * GB_ERROR
Definition: arb_core.h:25
char isStartCodon(const char *codon) const
group_matcher all()
Definition: test_unit.h:1000
GB_ERROR translate_removeInfo(GBDATA *gb_species)
Definition: Translate.cxx:34
char codon2aa(const char *codon) const
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1904
int TTIT_embl2arb(int embl_code_nr)
void memcpy3(char *dest, const char *source)
Definition: Translate.cxx:102
char * ARB_strduplen(const char *p, unsigned len)
Definition: arb_string.h:33
int CodeNr() const
int translate_nuc2aa(int arb_code_nr, char *data, size_t size, size_t pos, bool translate_all, bool create_start_codon, bool append_stop_codon, int *translatedSize)
Definition: Translate.cxx:108
GB_ERROR translate_saveInfo(GBDATA *gb_species, int arb_transl_table, int codon_start)
Definition: Translate.cxx:22
int TTIT_arb2embl(int arb_code_nr)
static void error(const char *msg)
Definition: mkptypes.cxx:96
#define that(thing)
Definition: test_unit.h:1032
GB_ERROR translate_getInfo(GBDATA *gb_item, int &arb_transl_table, int &codon_start)
Definition: Translate.cxx:48
char isStopCodon(const char *codon) const
#define is_equal_to(val)
Definition: test_unit.h:1014
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
bool is_std_gap(const char c)
#define NULp
Definition: cxxforward.h:97
#define tl_assert(cond)
Definition: Translate.cxx:20
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:441
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334