ARB
AP_pro_a_nucs.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : AP_pro_a_nucs.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "AP_pro_a_nucs.hxx"
12 
13 #include <AP_codon_table.hxx>
14 #include <arbdbt.h>
15 #include <ad_cb.h>
16 #include <arb_str.h>
17 #include <algorithm>
18 
19 #define pn_assert(cond) arb_assert(cond)
20 
22  int i;
23  AP_BASES val;
24  char *table = new char[256];
25 
26  for (i=0; i<256; i++) {
27  switch ((char)i) {
28  case 'a': case 'A': val = AP_A; break;
29  case 'g': case 'G': val = AP_G; break;
30  case 'c': case 'C': val = AP_C; break;
31  case 't': case 'T':
32  case 'u': case 'U': val = AP_T; break;
33  case '-': val = AP_GAP; break;
34  case 'm': case 'M': val = AP_BASES(AP_A + AP_C); break;
35  case 'r': case 'R': val = AP_BASES(AP_A + AP_G); break;
36  case 'w': case 'W': val = AP_BASES(AP_A + AP_T); break;
37  case 's': case 'S': val = AP_BASES(AP_C + AP_G); break;
38  case 'y': case 'Y': val = AP_BASES(AP_C + AP_T); break;
39  case 'k': case 'K': val = AP_BASES(AP_G + AP_T); break;
40  case 'v': case 'V': val = AP_BASES(AP_A + AP_C + AP_G); break;
41  case 'h': case 'H': val = AP_BASES(AP_A + AP_C + AP_T); break;
42  case 'd': case 'D': val = AP_BASES(AP_A + AP_G + AP_T); break;
43  case 'b': case 'B': val = AP_BASES(AP_C + AP_G + AP_T); break;
44  case 'n': case 'N': val = AP_BASES(AP_A + AP_G + AP_C + AP_T); break;
45  case '?': case '.': val = AP_BASES(AP_A + AP_G + AP_C + AP_T + AP_GAP); break; // = AP_DOT
46  default: val = AP_DOT; break; // interpret everything else like a dot (alternative would be to abort with error)
47  }
48  table[i] = (char)val;
49  }
50 
51  pn_assert(table[safeCharIndex('.')] == AP_DOT); // make sure a dot is a dot
52 
53  return table;
54 }
55 
56 long *AWT_translator::create_pro_to_bits() const {
57  long *table = ARB_calloc<long>(256);
58  for (int i = 0; i < max_aa; i++) {
59  int j = index_2_spro[i];
60  if (j == '.') {
61  table[i] = -1;
62  continue;
63  }
64  j = s2str[j]->index;
65  table[i] = 1<<j;
66  }
67  return table;
68 }
69 
70 void AWT_translator::build_table(unsigned char pbase, const char *nuc) {
71  struct arb_r2a_pro_2_nuc *str = s2str[pbase];
72 
73  // search for existing protein, else generate new
74  if (!str) {
75  str = new arb_r2a_pro_2_nuc;
76  s2str[pbase] = str;
77  s2str[tolower(pbase)] = str;
78 
79  str->index = max_aa++;
80  str->single_pro = pbase;
81 
82  index_2_spro[str->index] = pbase;
83  }
84 
85  // fast hash table
86  pn_assert(!GBS_read_hash(t2i_hash, nuc)); // used twice (cannot handle ambiguities, e.g. optional start-/stop-codons)
87  GBS_write_hash(t2i_hash, nuc, pbase);
88 
89  int n0 = nuc_2_bitset[safeCharIndex(nuc[0])];
90  int n1 = nuc_2_bitset[safeCharIndex(nuc[1])];
91  int n2 = nuc_2_bitset[safeCharIndex(nuc[2])];
92 
93  struct arb_r2a_pro_2_nucs *nucs;
94  for (nucs = str->nucs; nucs; nucs = nucs->next) {
95  if ((!(nucs->nucbits[0] & ~n0)) && // search superset
96  (!(nucs->nucbits[1] & ~n1)) &&
97  (!(nucs->nucbits[2] & ~n2))) break;
98 
99  int c = 0;
100  if (nucs->nucbits[0] != n0) c++;
101  if (nucs->nucbits[1] != n1) c++;
102  if (nucs->nucbits[2] != n2) c++;
103  if (c <= 1) break;
104  }
105  if (!nucs) {
106  nucs = new arb_r2a_pro_2_nucs;
107  nucs->next = str->nucs;
108  str->nucs = nucs;
109  }
110  nucs->nucbits[0] |= n0;
111  nucs->nucbits[1] |= n1;
112  nucs->nucbits[2] |= n2;
113 }
114 
115 // ----------------------------
116 // arb_r2a_pro_2_nucs
117 
119  : next(NULp)
120 {
121  memset(nucbits, 0, sizeof(nucbits));
122 }
124  delete next;
125 }
126 
127 // ---------------------------
128 // arb_r2a_pro_2_nuc
129 
131  single_pro(0),
132  index(0),
133  nucs(NULp)
134 {
135 }
137  delete nucs;
138 }
139 
140 // ------------------------
141 // AWT_translator
142 
143 static int codon_defined_in(const char *codon, const char *codons) {
144  for (int off=0; codons[off]; off+=3) {
145  if (codon[0]==codons[off] && codon[1]==codons[off+1] && codon[2]==codons[off+2]) {
146  return 1;
147  }
148  }
149  return 0;
150 }
151 
152 // order of tables generated with build_table() by AWT_translator ctor is important:
153 // must be compatible with DIST/PH_protdist.cxx !!
154 // except that this table has an 's' insertion !!!
155 
156 #define T2I_ENTRIES_MAX 196 // maximum number of generated translations (by code number 11)
157 
158 AWT_translator::AWT_translator(int arb_protein_code_nr) :
159  distance_meter(NULp),
160  code_nr(arb_protein_code_nr),
161  pro_2_bitset(NULp),
162  realmax_aa(0),
163  max_aa(0)
164 {
165  memset(s2str, 0, sizeof(s2str));
166  memset(index_2_spro, 0, sizeof(index_2_spro));
167 
168  nuc_2_bitset = AP_create_dna_to_ap_bases();
169  t2i_hash = GBS_create_hash(T2I_ENTRIES_MAX, GB_IGNORE_CASE); // case insensitive
170 
172 
173  {
174  char *D_codons = strdup(AP_get_codons('D', code_nr));
175  char *N_codons = strdup(AP_get_codons('N', code_nr));
176  char *E_codons = strdup(AP_get_codons('E', code_nr));
177  char *Q_codons = strdup(AP_get_codons('Q', code_nr));
178  char *I_codons = strdup(AP_get_codons('I', code_nr));
179  char *L_codons = strdup(AP_get_codons('L', code_nr));
180 
181  char protein;
182  for (protein='*'; protein<='Z'; protein = (protein=='*' ? 'A' : protein+1)) {
183  if (protein!='O' && protein!='U') { // OU are no aminos
184  const char *codons;
185  if (protein=='D') codons = D_codons;
186  else if (protein=='N') codons = N_codons;
187  else if (protein=='E') codons = E_codons;
188  else if (protein=='Q') codons = Q_codons;
189  else if (protein=='I') codons = I_codons;
190  else if (protein=='L') codons = L_codons;
191  else codons = AP_get_codons(protein, code_nr);
192  // codons now contains a 0-terminated-string containing all possible codons for protein
193 
194  for (int off=0; codons[off]; off+=3) {
195  char codon[4];
196  memcpy(codon, codons+off, 3);
197  codon[3] = 0;
198 
199  if (protein=='B') {
200  if (!codon_defined_in(codon, D_codons) && !codon_defined_in(codon, N_codons)) {
201  build_table(protein, codon);
202  }
203  }
204  else if (protein=='J') {
205  if (!codon_defined_in(codon, I_codons) && !codon_defined_in(codon, L_codons)) {
206  build_table(protein, codon);
207  }
208  }
209  else if (protein=='Z') {
210  if (!codon_defined_in(codon, E_codons) && !codon_defined_in(codon, Q_codons)) {
211  build_table(protein, codon);
212  }
213  }
214  else {
215  build_table(protein, codon);
216  }
217  }
218  }
219  }
220 
221  free(L_codons);
222  free(I_codons);
223  free(Q_codons);
224  free(E_codons);
225  free(N_codons);
226  free(D_codons);
227  }
228 
229  realmax_aa = max_aa;
230 
231  build_table('-', "---");
232  build_table('.', "...");
233  build_table('.', "???");
234  build_table('X', "NNN");
235 
237 
238  pro_2_bitset = create_pro_to_bits();
239 }
240 
242  free(pro_2_bitset);
243 
244  delete [] nuc_2_bitset;
245  GBS_free_hash(t2i_hash);
246  for (int i=0; i<256; i++) {
247  if (i != tolower(i)) continue; // do not delete duplicated entries (a-z == A-Z!)
248  delete s2str[i];
249  }
250 
251  delete distance_meter;
252 }
253 
255  if (!distance_meter) distance_meter = new AWT_distance_meter(this);
256  return distance_meter;
257 }
258 
259 
260 // ----------------------------
261 // Distance functions
262 
263 static int nuc_dist(const AWT_translator *translator, unsigned char p1, unsigned char p2) {
264  // calculate minimum necessary nucleotide-mutations for a given amino-acid-mutation
265 
266  const struct arb_r2a_pro_2_nuc *s1, *s2;
267  s1 = translator->S2str(p1);
268  s2 = translator->S2str(p2);
269  if ((!s1) || (!s2)) return -1;
270  struct arb_r2a_pro_2_nucs *n1, *n2;
271  long mindist = 3;
272  // Check all combinations, if any combination is valid -> zero distance
273  for (n1 = s1->nucs; n1; n1=n1->next) {
274  for (n2 = s2->nucs; n2; n2=n2->next) {
275  int dist = 0;
276  int i;
277  for (i=0; i<3; i++) {
278  if (n1->nucbits[i] & n2->nucbits[i]) continue;
279  dist++;
280  }
281  if (dist< mindist) mindist = dist;
282  }
283  }
284  return mindist;
285 }
286 
287 #if defined(DEBUG)
288 static void awt_pro_a_nucs_debug(const AWT_translator *translator, const AWT_distance_meter *distmeter) {
289  int max_aa = translator->MaxAA();
290  int realmax_aa = translator->RealmaxAA();
291 
292  for (int s = 0; s< max_aa; s++) {
293  const AWT_PDP *dist = distmeter->getDistance(s);
294 
295  // check bits should not be present in distpad
296  if (s<realmax_aa) {
297  for (int i=0; i<2; i++) {
298  // assertion: bit is set in patd[i] -> bit is clear in patd[i+1]
299  assert_or_exit((dist->patd[i] & ~dist->patd[i+1]) == 0);
300  }
301  }
302  printf("Base %c[%i]: Dist to ", translator->index2spro(s), s);
303  for (int d = 0; d< max_aa; d++) {
304  int i;
305  for (i=0; i<3; i++) {
306  if (dist->patd[i] & (1<<d)) break;
307  }
308  printf ("%c%i ", translator->index2spro(d), i);
309  }
310  printf ("\n");
311  }
312 }
313 #endif // DEBUG
314 
315 // ----------------------------
316 // AWT_distance_meter
317 
319  memset(dist_, 0, sizeof(dist_));
320 
321  int s;
322  int i;
323  int max_aa = translator->MaxAA();
324  int realmax_aa = translator->RealmaxAA();
325 
326  for (s = 0; s< max_aa; s++) {
327  ARB_calloc(dist_[s], max_aa);
328 
329  const arb_r2a_pro_2_nuc *s2str = translator->S2str(translator->index2spro(s));
330  for (i=0; i<3; i++) {
331  dist_[s]->nucbits[i] = s2str->nucs->nucbits[i];
332  }
333  }
334 
335  for (s = 0; s< max_aa; s++) {
336  for (int d = 0; d< realmax_aa; d++) {
337  int dist = nuc_dist(translator, translator->index2spro(s), translator->index2spro(d));
338 
339  if (dist==0) dist_[s]->patd[0] |= 1<<d; // distance == 0
340  if (dist<=1) dist_[s]->patd[1] |= 1<<d; // distance <= 1
341  }
342  dist_[s]->patd[2] |= dist_[s]->patd[1]; // (distance <= 1) => (distance <= 2)
343  dist_[s]->patd[0] |= 1<<s; // set 'distance to self' == 0
344  }
345 
346  for (s = 0; s< max_aa; s++) {
347  long sum = 0;
348  for (int d = 0; d< realmax_aa; d++) {
349  if ((1 << d) & dist_[s]->patd[1]) { // if distance(s, d) <= 1
350  sum |= dist_[d]->patd[1]; // collect all proteins which have 'distance <= 1' to 'd'
351  }
352  }
353  dist_[s]->patd[2] |= sum; // and store them in 'distance <= 2'
354  }
355 
356 #ifdef DEBUG
357  awt_pro_a_nucs_debug(translator, this);
358 #endif
359 }
360 
362  for (int i=0; i<64; i++) {
363  free(dist_[i]);
364  }
365 
366 }
367 
368 // --------------------------------------------------------------------------------
369 // Translator factory:
370 
371 static int current_user_code_nr = -1; // always contain same value as AWAR_PROTEIN_TYPE (after calling AWT_default_protein_type once)
372 
373 static void user_code_nr_changed_cb(GBDATA *gb_awar) {
374  // this callback keeps 'current_user_code_nr' synced with AWAR_PROTEIN_TYPE
375  GBDATA *gb_main = GB_get_root(gb_awar);
376  GB_transaction ta(gb_main);
377  current_user_code_nr = GB_read_int(gb_awar);
378 }
379 
380 #define CACHED_TRANSLATORS 4
381 
384 
385  if (cached[0].isNull() || cached[0]->CodeNr() != code_nr) { // most recent != requested
386  SmartPtr<AWT_translator> translator;
387 
388  int i;
389  for (i = 1; i<CACHED_TRANSLATORS; i++) {
390  if (cached[i].isSet() && cached[i]->CodeNr() == code_nr) {
391  // found existing translator
392  translator = cached[i];
393  cached[i].setNull();
394  break;
395  }
396  }
397 
398  if (translator.isNull()) {
399  translator = new AWT_translator(code_nr);
400  }
401 
402  // insert new or found translator at front and shift existing to higher indices:
403  for (i = 0; i<CACHED_TRANSLATORS && translator.isSet(); i++) {
404  std::swap(cached[i], translator);
405  }
406 
407  // deletes oldest translator, if no empty array position was found:
408  }
409 
410  pn_assert(cached[0]->CodeNr() == code_nr);
411  return &*cached[0];
412 }
413 
415  // returns protein code selected in AWAR_PROTEIN_TYPE
416 
417  if (current_user_code_nr == -1) { // user protein code in AWAR_PROTEIN_TYPE not traced yet
418  pn_assert(gb_main); // either pass gb_main here or call once with valid gb_main at program startup
419 
420  {
421  GB_transaction ta(gb_main);
422  GBDATA *awar = GB_search(gb_main, AWAR_PROTEIN_TYPE, GB_INT);
423  GB_add_callback(awar, GB_CB_CHANGED, makeDatabaseCallback(user_code_nr_changed_cb)); // bind a callback that traces AWAR_PROTEIN_TYPE
425  }
426 
427  pn_assert(current_user_code_nr != -1);
428  }
429 
430  return current_user_code_nr;
431 }
432 
435 }
436 
437 // --------------------------------------------------------------------------------
438 
439 #ifdef UNIT_TESTS
440 #ifndef TEST_UNIT_H
441 #include <test_unit.h>
442 #endif
443 
444 static int test_code_nr = -1;
445 #if defined(ENABLE_CRASH_TESTS)
446 static void translator_instance() {
447  AWT_translator instance(test_code_nr);
448 }
449 #endif
450 
451 void TEST_translator_instantiation() {
452  for (int i = 0; i<AWT_CODON_TABLES; ++i) {
453  TEST_ANNOTATE(GBS_global_string("i=%i", i));
454  test_code_nr = i;
455  TEST_EXPECT_NO_SEGFAULT(translator_instance);
456  }
457 }
458 
459 #endif // UNIT_TESTS
460 
461 // --------------------------------------------------------------------------------
static int current_user_code_nr
char * AP_create_dna_to_ap_bases()
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:729
const AWT_distance_meter * getDistanceMeter() const
const char * AP_get_codons(char protein, int code_nr)
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:454
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
AWT_distance_meter(const AWT_translator *translator)
GB_ERROR GB_add_callback(GBDATA *gbd, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:356
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:538
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248
int MaxAA() const
void setNull()
set SmartPtr to NULp
Definition: smartptr.h:251
AP_BASES
#define CACHED_TRANSLATORS
struct arb_r2a_pro_2_nucs * nucs
bool isSet() const
test if SmartPtr is not NULp
Definition: smartptr.h:245
int RealmaxAA() const
static void user_code_nr_changed_cb(GBDATA *gb_awar)
Generic smart pointer.
Definition: smartptr.h:149
GBDATA * GB_get_root(GBDATA *gbd)
Definition: arbdb.cxx:1740
struct arb_r2a_pro_2_nucs * next
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
char * str
Definition: defines.h:20
size_t GBS_hash_elements(const GB_HASH *hs)
Definition: adhash.cxx:570
AWT_translator * AWT_get_user_translator(GBDATA *gb_main)
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
static int codon_defined_in(const char *codon, const char *codons)
#define assert_or_exit(cond)
Definition: arb_assert.h:263
#define TEST_EXPECT_NO_SEGFAULT(cb)
Definition: test_unit.h:1250
AWT_translator(int arb_protein_code_nr)
#define AWAR_PROTEIN_TYPE
#define AWT_CODON_TABLES
int AWT_default_protein_type(GBDATA *gb_main)
#define NULp
Definition: cxxforward.h:116
long patd[3]
static int nuc_dist(const AWT_translator *translator, unsigned char p1, unsigned char p2)
#define T2I_ENTRIES_MAX
GB_transaction ta(gb_var)
GBDATA * gb_main
Definition: adname.cxx:32
void AP_initialize_codon_tables()
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
#define pn_assert(cond)
unsigned char index2spro(int index) const
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:392
const AWT_PDP * getDistance(int idx) const
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
AWT_translator * AWT_get_translator(int code_nr)
const arb_r2a_pro_2_nuc * S2str(int index) const
Definition: arbdb.h:66
char nucbits[3]
GB_write_int const char s
Definition: AW_awar.cxx:154