ARB
GEN_translations.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : GEN_translations.cxx //
4 // Purpose : supports removal of redundant translations of //
5 // gene CDS //
6 // //
7 // Coded by Ralf Westram (coder@reallysoft.de) in January 2009 //
8 // Institute of Microbiology (Technical University Munich) //
9 // http://www.arb-home.de/ //
10 // //
11 // =============================================================== //
12 
13 #include "GEN_local.hxx"
14 
15 #include <Translate.hxx>
16 #include <AP_codon_table.hxx>
17 #include <aw_question.hxx>
18 #include <arbdbt.h>
19 
20 using namespace std;
21 
22 // -------------------------------------------------
23 // remove redundant translations from genes
24 
25 // @@@ add menu-entry to genome-NTREE ("Remove reproducible translations")
26 
27 static char *translate_gene_sequence(GBDATA *gb_gene, GB_ERROR& error, int& translated_length, char *startCodon) {
28  // return translation of gene sequence
29  // the start codon is copied into result buffer 'startCodon' (has to be sized 4 bytes)
30 
31  size_t gene_length;
32  char *gene_seq = GBT_read_gene_sequence_and_length(gb_gene, true, 0, &gene_length);
33  if (!gene_seq) error = GB_await_error();
34  else {
35  // store start codon in result buffer:
36  memcpy(startCodon, gene_seq, 3);
37  startCodon[3] = 0;
38 
39  int arb_transl_table, codon_start;
40  error = translate_getInfo(gb_gene, arb_transl_table, codon_start);
41 
42  if (arb_transl_table == -1) arb_transl_table = TTIT_embl2arb(1); // use embl table 1 (standard code)
43  if (codon_start == -1) codon_start = 0; // default codon start
44 
45  if (!error) translate_nuc2aa(arb_transl_table, gene_seq, gene_length, codon_start, false, true, true, &translated_length);
46 
47  if (error) {
48  free(gene_seq);
49  gene_seq = NULp;
50  }
51  }
52 
53  return gene_seq;
54 }
55 
// Result of remove_redundant_translation().
// Values above GRS_FAILED are single bits and get combined via bitwise OR;
// GRS_FAILED is assigned on its own whenever an error is reported.
enum GEN_remove_state {
    GRS_NO_CHANGE           = 0,  // no translation found
    GRS_FAILED              = 1,  // error is set
    GRS_TRANSLATION_REMOVED = 2,  // translation was present, reproducible and has been removed
    GRS_TRANSLATION_FAILED  = 4,  // translation differed (wrote ARB translation to field 'ARB_translation')
    GRS_START_CODON_WRONG   = 8,  // translation differed only in start codon
    GRS_NOTE_ADDED          = 16, // note has been added
};
64 
65 static GEN_remove_state remove_redundant_translation(GBDATA *gb_gene, bool ignore_start_codon_error, char *errornousCodon, GB_ERROR &error) {
66  // If translation can be re-produced by ARB,
67  // it will be removed
68  // ('ARB_translation' will be removed as well in this case)
69  // Otherwise
70  // a field 'ARB_translation' is inserted, which contains the translation generated by ARB.
71  //
72  // If result is GRS_START_CODON_WRONG, the questionable codon is copied into errornousCodon.
73  // (errornousCodon has to be a buffer with size == 4)
74 
75  // @@@ If another code or codonstart translates fine, a hint shall be written to field 'translation_hint'
76 
78  char *add_note = NULp; // will be added as 'ARB_translation_note' (if set)
79  error = NULp;
80 
81 #define set_result_bit(s) result = GEN_remove_state(result|s)
82 
83  GBDATA *gb_translation = GB_entry(gb_gene, "translation");
84  if (gb_translation) {
85  int translated_length;
86  char *generated = translate_gene_sequence(gb_gene, error, translated_length, errornousCodon);
87 
88  if (!generated || translated_length<1) {
89  // insert note and continue
90  add_note = GBS_global_string_copy("Failed to translate gene-sequence (%s)", error);
91  error = NULp;
93  }
94  else {
95  if (generated[translated_length-1] == '*') {
96  generated[--translated_length] = 0; // cut off stop codon
97  }
98 
99  const char *original = GB_read_char_pntr(gb_translation);
100 
101  bool remove = false;
102  if (strcmp(generated+1, original+1) == 0) { // most of translation matches
103  if (generated[0] == original[0]) { // start codon matches
104  remove = true;
105  }
106  else { // start codon differs
108  remove = ignore_start_codon_error; // and delete if requested
109  }
110  }
111 
112  if (remove) { // remove translation and related entries
113  const char *to_remove[] = {
114  "translation",
115  "ARB_translation",
116  "ARB_translation_note",
117  NULp
118  };
119 
120  GB_ERROR err = NULp;
121  int failed_field = -1;
122 
123  for (int r = 0; to_remove[r] && !err; ++r) {
124  GBDATA *gb_remove = GB_entry(gb_gene, to_remove[r]);
125  if (gb_remove) {
126  err = GB_delete(gb_remove);
127  if (err) failed_field = r;
128  }
129  }
130  if (err) error = GBS_global_string("Failed to delete field '%s' (%s)", to_remove[failed_field], err);
131  else {
132  error = GBT_write_byte(gb_gene, "ARB_translation_rm", 1);
134  }
135  }
136  else {
137  error = GBT_write_string(gb_gene, "ARB_translation", generated);
139  }
140  }
141  free(generated);
142  }
143 
144  if (add_note && !error) {
145  error = GBT_write_string(gb_gene, "ARB_translation_note", add_note);
147  }
148 
149  if (error) result = GRS_FAILED;
150  free(add_note);
151 
152  return result;
153 
154 #undef set_result_bit
155 }
156 
157 GB_ERROR GEN_testAndRemoveTranslations(GBDATA *gb_gene_data, void (*warn)(AW_CL cd, const char *msg), AW_CL cd, AW_repeated_question *ok_to_ignore_wrong_start_codon) {
158  int ok = 0; // identical translations
159  int failed = 0; // non-identical translations
160  int wrong_start_codon = 0; // translations where start_codon differed
161  int no_entry = 0; // genes w/o 'translation' entry
162  int note_added = 0; // count gene for which a note has been added
163  GB_ERROR error = NULp;
164 
165  const int possibleCodons = 4*4*4;
166  GB_HASH *wrongStartCodons = GBS_create_hash(possibleCodons, GB_IGNORE_CASE);
167 
168  for (GBDATA *gb_gene = GB_entry(gb_gene_data, "gene"); gb_gene && !error; gb_gene = GB_nextEntry(gb_gene)) {
169  int retry = 0;
170  for (int Try = 0; Try <= retry && !error; Try++) {
171  error = NULp;
172 
173  char startCodon[4];
174  GEN_remove_state state = remove_redundant_translation(gb_gene, Try, startCodon, error);
175 
176  switch (state) {
177  case GRS_NO_CHANGE:
178  no_entry++;
179  break;
180 
181  case GRS_FAILED:
182  gen_assert(error);
183  break;
184 
185  default:
186  if (state&GRS_TRANSLATION_REMOVED) {
187  ok++;
188  }
189  else {
191  if (Try == 0) {
192  if (state&GRS_START_CODON_WRONG) {
193  wrong_start_codon++;
194  AW_repeated_question* q = ok_to_ignore_wrong_start_codon;
195 
196  if (q->get_answer("only_start_codon_differs",
197  "Translation differs only in start codon",
198  "Ignore and remove,Keep translation", "all", false) == 0) {
199  retry++;
200  }
201  else {
202  failed++;
203  }
204 
205  GBS_incr_hash(wrongStartCodons, startCodon);
206  }
207  else if (state&GRS_NOTE_ADDED) {
208  failed++;
209  note_added++;
210  }
211  }
212  else {
213  failed++;
214  }
215  }
216  break;
217  }
218  }
219  }
220 
221  if (!error && failed>0) {
222  warn(cd, GBS_global_string("%i translations could not be reproduced by ARB", failed));
223  static bool first_warning = true;
224  if (first_warning) { // show details once
225  warn(cd,
226  "Note: Reproducible translations were removed from database.\n"
227  " Failed translations were left in database and an additional\n"
228  " field 'ARB_translation' was added.");
229  warn(cd, GBS_global_string("- %i genes had no translation entry", no_entry));
230  warn(cd, GBS_global_string("- %i translations were reproducible", ok));
231  first_warning = false;
232  }
233  if (wrong_start_codon>0) {
234  char *codonInfo = GBS_hashtab_2_string(wrongStartCodons);
235  warn(cd, GBS_global_string("- %i translations had wrong start codon (%s)", wrong_start_codon, codonInfo));
236  free(codonInfo);
237  }
238  if (note_added>0) {
239  warn(cd, GBS_global_string("- %i ARB_translation_note entries were generated. Please examine!", note_added));
240  }
241  }
242 
243  GBS_free_hash(wrongStartCodons);
244 
245  return error;
246 }
247 
248 
249 
const char * GB_ERROR
Definition: arb_core.h:25
static GEN_remove_state remove_redundant_translation(GBDATA *gb_gene, bool ignore_start_codon_error, char *errornousCodon, GB_ERROR &error)
string result
long GBS_incr_hash(GB_HASH *hs, const char *key)
Definition: adhash.cxx:470
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
GB_ERROR GEN_testAndRemoveTranslations(GBDATA *gb_gene_data, void(*warn)(AW_CL cd, const char *msg), AW_CL cd, AW_repeated_question *ok_to_ignore_wrong_start_codon)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:202
STL namespace.
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:541
#define gen_assert(bed)
Definition: GEN_local.hxx:19
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1899
int TTIT_embl2arb(int embl_code_nr)
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:341
int translate_nuc2aa(int arb_code_nr, char *data, size_t size, size_t pos, bool translate_all, bool create_start_codon, bool append_stop_codon, int *translatedSize)
Definition: Translate.cxx:108
static void error(const char *msg)
Definition: mkptypes.cxx:96
static char * translate_gene_sequence(GBDATA *gb_gene, GB_ERROR &error, int &translated_length, char *startCodon)
GB_ERROR translate_getInfo(GBDATA *gb_item, int &arb_transl_table, int &codon_start)
Definition: Translate.cxx:48
GB_ERROR GBT_write_byte(GBDATA *gb_container, const char *fieldpath, unsigned char content)
Definition: adtools.cxx:486
char * GBS_hashtab_2_string(const GB_HASH *hash)
Definition: adhash.cxx:371
long AW_CL
Definition: cb.h:21
#define set_result_bit(s)
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
GEN_remove_state
#define NULp
Definition: cxxforward.h:97
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
NOT4PERL char * GBT_read_gene_sequence_and_length(GBDATA *gb_gene, bool use_revComplement, char partSeparator, size_t *gene_length)
Definition: adali.cxx:827
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:193
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253