ARB
GEN_translations.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : GEN_translations.cxx //
4 // Purpose : supports removal of redundant translations of //
5 // gene CDS //
6 // //
7 // Coded by Ralf Westram (coder@reallysoft.de) in January 2009 //
8 // Institute of Microbiology (Technical University Munich) //
9 // http://www.arb-home.de/ //
10 // //
11 // =============================================================== //
12 
13 #include "GEN_local.hxx"
14 
15 #include <Translate.hxx>
16 #include <AP_codon_table.hxx>
17 #include <aw_question.hxx>
18 #include <arbdbt.h>
19 
20 using namespace std;
21 
22 // -------------------------------------------------
23 // remove redundant translations from genes
24 
25 #if defined(WARN_TODO)
26 # warning add menu-entry to genome-NTREE ("Remove reproducible translations")
27 #endif
28 
29 static char *translate_gene_sequence(GBDATA *gb_gene, GB_ERROR& error, int& translated_length, char *startCodon) {
30  // return translation of gene sequence
31  // the start codon is copied into result buffer 'startCodon' (has to be sized 4 bytes)
32 
33  size_t gene_length;
34  char *gene_seq = GBT_read_gene_sequence_and_length(gb_gene, true, 0, &gene_length);
35  if (!gene_seq) error = GB_await_error();
36  else {
37  // store start codon in result buffer:
38  memcpy(startCodon, gene_seq, 3);
39  startCodon[3] = 0;
40 
41  int arb_transl_table, codon_start;
42  error = translate_getInfo(gb_gene, arb_transl_table, codon_start);
43 
44  if (arb_transl_table == -1) arb_transl_table = TTIT_embl2arb(1); // use embl table 1 (standard code)
45  if (codon_start == -1) codon_start = 0; // default codon start
46 
47  if (!error) translate_nuc2aa(arb_transl_table, gene_seq, gene_length, codon_start, false, true, true, &translated_length);
48 
49  if (error) {
50  free(gene_seq);
51  gene_seq = NULp;
52  }
53  }
54 
55  return gene_seq;
56 }
57 
enum GEN_remove_state {
    // Result bits of remove_redundant_translation().
    // Values are powers of two, so several states can be combined with '|'.
    GRS_NO_CHANGE           = 0,  // no translation found
    GRS_FAILED              = 1,  // error is set
    GRS_TRANSLATION_REMOVED = 2,  // translation was present, reproducible and has been removed
    GRS_TRANSLATION_FAILED  = 4,  // translation differed (wrote ARB translation to field 'ARB_translation')
    GRS_START_CODON_WRONG   = 8,  // translation differed only in start codon
    GRS_NOTE_ADDED          = 16, // note has been added
};
66 
67 static GEN_remove_state remove_redundant_translation(GBDATA *gb_gene, bool ignore_start_codon_error, char *errornousCodon, GB_ERROR &error) {
68  // If translation can be re-produced by ARB,
69  // it will be removed
70  // ('ARB_translation' will be removed as well in this case)
71  // Otherwise
72  // a field 'ARB_translation' is inserted, which contains the translation generated by ARB.
73  //
74  // If result is GRS_START_CODON_WRONG, the questionable codon is copied into errornousCodon.
75  // (errornousCodon has to be a buffer with size == 4)
76  //
77  // If another code or codonstart translates fine, a hint shall be written to field 'translation_hint'
78 #if defined(WARN_TODO)
79 #warning TODO: If another code or codonstart translates fine, a hint shall be written to field 'translation_hint'
80 #endif
81 
83  char *add_note = NULp; // will be added as 'ARB_translation_note' (if set)
84  error = NULp;
85 
86 #define set_result_bit(s) result = GEN_remove_state(result|s)
87 
88  GBDATA *gb_translation = GB_entry(gb_gene, "translation");
89  if (gb_translation) {
90  int translated_length;
91  char *generated = translate_gene_sequence(gb_gene, error, translated_length, errornousCodon);
92 
93  if (!generated || translated_length<1) {
94  // insert note and continue
95  add_note = GBS_global_string_copy("Failed to translate gene-sequence (%s)", error);
96  error = NULp;
98  }
99  else {
100  if (generated[translated_length-1] == '*') {
101  generated[--translated_length] = 0; // cut off stop codon
102  }
103 
104  const char *original = GB_read_char_pntr(gb_translation);
105 
106  bool remove = false;
107  if (strcmp(generated+1, original+1) == 0) { // most of translation matches
108  if (generated[0] == original[0]) { // start codon matches
109  remove = true;
110  }
111  else { // start codon differs
113  remove = ignore_start_codon_error; // and delete if requested
114  }
115  }
116 
117  if (remove) { // remove translation and related entries
118  const char *to_remove[] = {
119  "translation",
120  "ARB_translation",
121  "ARB_translation_note",
122  NULp
123  };
124 
125  GB_ERROR err = NULp;
126  int failed_field = -1;
127 
128  for (int r = 0; to_remove[r] && !err; ++r) {
129  GBDATA *gb_remove = GB_entry(gb_gene, to_remove[r]);
130  if (gb_remove) {
131  err = GB_delete(gb_remove);
132  if (err) failed_field = r;
133  }
134  }
135  if (err) error = GBS_global_string("Failed to delete field '%s' (%s)", to_remove[failed_field], err);
136  else {
137  error = GBT_write_byte(gb_gene, "ARB_translation_rm", 1);
139  }
140  }
141  else {
142  error = GBT_write_string(gb_gene, "ARB_translation", generated);
144  }
145  }
146  free(generated);
147  }
148 
149  if (add_note && !error) {
150  error = GBT_write_string(gb_gene, "ARB_translation_note", add_note);
152  }
153 
154  if (error) result = GRS_FAILED;
155  free(add_note);
156 
157  return result;
158 
159 #undef set_result_bit
160 }
161 
162 GB_ERROR GEN_testAndRemoveTranslations(GBDATA *gb_gene_data, void (*warn)(AW_CL cd, const char *msg), AW_CL cd, AW_repeated_question *ok_to_ignore_wrong_start_codon) {
163  int ok = 0; // identical translations
164  int failed = 0; // non-identical translations
165  int wrong_start_codon = 0; // translations where start_codon differed
166  int no_entry = 0; // genes w/o 'translation' entry
167  int note_added = 0; // count gene for which a note has been added
168  GB_ERROR error = NULp;
169 
170  const int possibleCodons = 4*4*4;
171  GB_HASH *wrongStartCodons = GBS_create_hash(possibleCodons, GB_IGNORE_CASE);
172 
173  for (GBDATA *gb_gene = GB_entry(gb_gene_data, "gene"); gb_gene && !error; gb_gene = GB_nextEntry(gb_gene)) {
174  int retry = 0;
175  for (int Try = 0; Try <= retry && !error; Try++) {
176  error = NULp;
177 
178  char startCodon[4];
179  GEN_remove_state state = remove_redundant_translation(gb_gene, Try, startCodon, error);
180 
181  switch (state) {
182  case GRS_NO_CHANGE:
183  no_entry++;
184  break;
185 
186  case GRS_FAILED:
187  gen_assert(error);
188  break;
189 
190  default:
191  if (state&GRS_TRANSLATION_REMOVED) {
192  ok++;
193  }
194  else {
196  if (Try == 0) {
197  if (state&GRS_START_CODON_WRONG) {
198  wrong_start_codon++;
199  AW_repeated_question* q = ok_to_ignore_wrong_start_codon;
200 
201  if (q->get_answer("only_start_codon_differs",
202  "Translation differs only in start codon",
203  "Ignore and remove,Keep translation", "all", false) == 0) {
204  retry++;
205  }
206  else {
207  failed++;
208  }
209 
210  GBS_incr_hash(wrongStartCodons, startCodon);
211  }
212  else if (state&GRS_NOTE_ADDED) {
213  failed++;
214  note_added++;
215  }
216  }
217  else {
218  failed++;
219  }
220  }
221  break;
222  }
223  }
224  }
225 
226  if (!error && failed>0) {
227  warn(cd, GBS_global_string("%i translations could not be reproduced by ARB", failed));
228  static bool first_warning = true;
229  if (first_warning) { // show details once
230  warn(cd,
231  "Note: Reproducible translations were removed from database.\n"
232  " Failed translations were left in database and an additional\n"
233  " field 'ARB_translation' was added.");
234  warn(cd, GBS_global_string("- %i genes had no translation entry", no_entry));
235  warn(cd, GBS_global_string("- %i translations were reproducible", ok));
236  first_warning = false;
237  }
238  if (wrong_start_codon>0) {
239  char *codonInfo = GBS_hashtab_2_string(wrongStartCodons);
240  warn(cd, GBS_global_string("- %i translations had wrong start codon (%s)", wrong_start_codon, codonInfo));
241  free(codonInfo);
242  }
243  if (note_added>0) {
244  warn(cd, GBS_global_string("- %i ARB_translation_note entries were generated. Please examine!", note_added));
245  }
246  }
247 
248  GBS_free_hash(wrongStartCodons);
249 
250  return error;
251 }
252 
253 
254 
const char * GB_ERROR
Definition: arb_core.h:25
static GEN_remove_state remove_redundant_translation(GBDATA *gb_gene, bool ignore_start_codon_error, char *errornousCodon, GB_ERROR &error)
string result
long GBS_incr_hash(GB_HASH *hs, const char *key)
Definition: adhash.cxx:470
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
GB_ERROR GEN_testAndRemoveTranslations(GBDATA *gb_gene_data, void(*warn)(AW_CL cd, const char *msg), AW_CL cd, AW_repeated_question *ok_to_ignore_wrong_start_codon)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
STL namespace.
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:541
#define gen_assert(bed)
Definition: GEN_local.hxx:19
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1880
int TTIT_embl2arb(int embl_code_nr)
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
int translate_nuc2aa(int arb_code_nr, char *data, size_t size, size_t pos, bool translate_all, bool create_start_codon, bool append_stop_codon, int *translatedSize)
Definition: Translate.cxx:108
static void error(const char *msg)
Definition: mkptypes.cxx:96
static char * translate_gene_sequence(GBDATA *gb_gene, GB_ERROR &error, int &translated_length, char *startCodon)
GB_ERROR translate_getInfo(GBDATA *gb_item, int &arb_transl_table, int &codon_start)
Definition: Translate.cxx:48
GB_ERROR GBT_write_byte(GBDATA *gb_container, const char *fieldpath, unsigned char content)
Definition: adtools.cxx:486
char * GBS_hashtab_2_string(const GB_HASH *hash)
Definition: adhash.cxx:371
long AW_CL
Definition: cb.h:21
#define set_result_bit(s)
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
GEN_remove_state
#define NULp
Definition: cxxforward.h:97
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:874
NOT4PERL char * GBT_read_gene_sequence_and_length(GBDATA *gb_gene, bool use_revComplement, char partSeparator, size_t *gene_length)
Definition: adali.cxx:817
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253