ARB
ST_ml.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ST_ml.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "st_ml.hxx"
12 #include "MostLikelySeq.hxx"
13 
14 #include <ColumnStat.hxx>
15 #include <AP_filter.hxx>
16 #include <AP_Tree.hxx>
17 #include <arb_progress.h>
18 #include <gui_aliview.hxx>
19 #include <ad_cb.h>
20 
21 #include <cctype>
22 #include <cmath>
23 
25 
27  int i;
28  for (i = 0; i < 256; i++) {
29  switch (toupper(i)) {
30  case 'A':
31  char_to_enum_table[i] = ST_A;
32  break;
33  case 'C':
34  char_to_enum_table[i] = ST_C;
35  break;
36  case 'G':
37  char_to_enum_table[i] = ST_G;
38  break;
39  case 'T':
40  case 'U':
41  char_to_enum_table[i] = ST_T;
42  break;
43  case '-':
44  char_to_enum_table[i] = ST_GAP;
45  break;
46  default:
47  char_to_enum_table[i] = ST_UNKNOWN;
48  }
49  }
50 }
51 
52 // -----------------------
53 // ST_base_vector
54 
55 void ST_base_vector::setBase(const ST_base_vector& inv_frequencies, char base) {
56  base = toupper(base);
57 
58  memset((char *) &b[0], 0, sizeof(b));
59  DNA_Base ub = dna_table.char_to_enum(base);
60 
61  if (ub != ST_UNKNOWN) {
62  b[ub] = 1.0; // ill. access ?
63  }
64  else {
65  const double k = 1.0 / ST_MAX_BASE;
66  b[ST_A] = k;
67  b[ST_C] = k;
68  b[ST_G] = k;
69  b[ST_T] = k;
70  b[ST_GAP] = k;
71  }
72  for (int i = 0; i < ST_MAX_BASE; i++) { // LOOP_VECTORIZED[!<5.0]
73  b[i] *= inv_frequencies.b[i];
74  }
75  ld_lik = 0; // ? why not 1.0 ?
76  lik = 1.0;
77 }
78 
80  ST_FLOAT sum = summarize();
81 
82  if (sum < .00001) { // what happend no data, extremely unlikely
83  setTo(0.25); // strange! shouldn't this be 1.0/ST_MAX_BASE ?
84  ld_lik -= 5; // ???
85  }
86  else {
87  while (sum < 0.25) {
88  sum *= 4;
89  ld_lik -= 2;
90  multiplyWith(4);
91  }
92  }
93 
94  if (ld_lik> 10000) printf("overflow\n");
95 }
96 
98  b[ST_A] *= other.b[ST_A];
99  b[ST_C] *= other.b[ST_C];
100  b[ST_G] *= other.b[ST_G];
101  b[ST_T] *= other.b[ST_T];
102  b[ST_GAP] *= other.b[ST_GAP];
103 
104  ld_lik += other.ld_lik; // @@@ correct to use 'plus' here ? why ?
105  lik *= other.lik;
106 
107  return *this;
108 }
109 
111  int i;
112  for (i = 0; i < ST_MAX_BASE; i++) {
113  printf("%.3G ", b[i]);
114  }
115 }
116 
117 // -----------------------
118 // ST_rate_matrix
119 
120 void ST_rate_matrix::set(double dist, double /* TT_ratio */) {
121  const double k = 1.0 / ST_MAX_BASE;
122  ST_FLOAT exp_dist = exp(-dist);
123 
124  diag = k + (1.0 - k) * exp_dist;
125  rest = k - k * exp_dist;
126 }
127 
129  for (int i = 0; i < ST_MAX_BASE; i++) {
130  for (int j = 0; j < ST_MAX_BASE; j++) {
131  printf("%.3G ", i == j ? diag : rest);
132  }
133  printf("\n");
134  }
135 }
136 
137 
138 inline void ST_rate_matrix::transform(const ST_base_vector& in, ST_base_vector& out) const {
139  // optimized matrix/vector multiplication
140  // original version: http://bugs.arb-home.de/browser/trunk/STAT/ST_ml.cxx?rev=6403#L155
141 
142  ST_FLOAT sum = in.summarize();
143  ST_FLOAT diag_rest_diff = diag-rest;
144  ST_FLOAT sum_rest_prod = sum*rest;
145 
146  out.b[ST_A] = in.b[ST_A]*diag_rest_diff + sum_rest_prod;
147  out.b[ST_C] = in.b[ST_C]*diag_rest_diff + sum_rest_prod;
148  out.b[ST_G] = in.b[ST_G]*diag_rest_diff + sum_rest_prod;
149  out.b[ST_T] = in.b[ST_T]*diag_rest_diff + sum_rest_prod;
150  out.b[ST_GAP] = in.b[ST_GAP]*diag_rest_diff + sum_rest_prod;
151 
152  out.ld_lik = in.ld_lik;
153  out.lik = in.lik;
154 }
155 
156 
157 // -----------------------
158 // MostLikelySeq
159 
160 MostLikelySeq::MostLikelySeq(const AliView *aliview, ST_ML *st_ml_) :
161  AP_sequence(aliview),
162  st_ml(st_ml_),
163  sequence(new ST_base_vector[ST_MAX_SEQ_PART]),
164  up_to_date(false),
165  color_out(NULp),
166  color_out_valid_till(NULp)
167 {}
168 
170  delete [] sequence;
171  free(color_out);
172  free(color_out_valid_till);
173 
174  unbind_from_species(true);
175 }
176 
178  seq->sequence_change();
179 }
180 
182  seq->unbind_from_species(false);
183 }
184 
185 
188  if (!error) {
189  GBDATA *gb_seq = get_bound_species_data();
190  st_assert(gb_seq);
191 
192  error = GB_add_callback(gb_seq, GB_CB_CHANGED, makeDatabaseCallback(st_sequence_callback, this));
193  if (!error) error = GB_add_callback(gb_seq, GB_CB_DELETE, makeDatabaseCallback(st_sequence_del_callback, this));
194  }
195  return error;
196 }
197 void MostLikelySeq::unbind_from_species(bool remove_callbacks) {
198  GBDATA *gb_seq = get_bound_species_data();
199 
200  if (gb_seq) {
201  if (remove_callbacks) {
202  GB_remove_callback(gb_seq, GB_CB_CHANGED, makeDatabaseCallback(st_sequence_callback, this));
203  GB_remove_callback(gb_seq, GB_CB_DELETE, makeDatabaseCallback(st_sequence_del_callback, this));
204  }
206  }
207 }
208 
210  st_ml->clear_all();
211 }
212 
214  return new MostLikelySeq(get_aliview(), st_ml);
215 }
216 
217 void MostLikelySeq::set(const char *) {
218  st_assert(0); // hmm why not perform set_sequence() here ?
219 }
220 
221 void MostLikelySeq::unset() {
222 }
223 
229  GBDATA *gb_data = get_bound_species_data();
230  st_assert(gb_data);
231 
232  size_t source_sequence_len = (size_t)GB_read_string_count(gb_data);
233  const char *source_sequence = GB_read_char_pntr(gb_data) + st_ml->get_first_pos();
234  ST_base_vector *dest = sequence;
235  const ST_base_vector *freq = st_ml->get_inv_base_frequencies() + st_ml->get_first_pos();
236 
237  size_t range_len = st_ml->get_last_pos() - st_ml->get_first_pos();
238  size_t data_len = std::min(range_len, source_sequence_len);
239  size_t pos = 0;
240 
241  for (; pos<data_len; ++pos) dest[pos].setBase(freq[pos], toupper(source_sequence[pos]));
242  for (; pos<range_len; ++pos) dest[pos].setBase(freq[pos], '.');
243 
244  up_to_date = true;
245 }
246 
247 void MostLikelySeq::calculate_ancestor(const MostLikelySeq *lefts, double leftl, const MostLikelySeq *rights, double rightl) {
248  st_assert(!up_to_date);
249 
250  ST_base_vector hbv;
251  double lc = leftl / st_ml->get_step_size();
252  double rc = rightl / st_ml->get_step_size();
253  const ST_base_vector *lb = lefts->sequence;
254  const ST_base_vector *rb = rights->sequence;
255  ST_base_vector *dest = sequence;
256 
257  for (size_t pos = st_ml->get_first_pos(); pos < st_ml->get_last_pos(); pos++) {
258  st_assert(lb->lik == 1 && rb->lik == 1);
259 
260  int distl = (int) (st_ml->get_rate_at(pos) * lc);
261  int distr = (int) (st_ml->get_rate_at(pos) * rc);
262 
263  st_ml->get_matrix_for(distl).transform(*lb, *dest);
264  st_ml->get_matrix_for(distr).transform(*rb, hbv);
265 
266  *dest *= hbv;
267  dest->check_overflow();
268 
269  st_assert(dest->lik == 1);
270 
271  dest++;
272  lb++;
273  rb++;
274  }
275 
276  up_to_date = true;
277 }
278 
280 
281 void MostLikelySeq::calc_out(const MostLikelySeq *next_branch, double dist) {
282  // result will be in tmp_out
283 
284  ST_base_vector *out = tmp_out + st_ml->get_first_pos();
285  double lc = dist / st_ml->get_step_size();
286  ST_base_vector *lefts = next_branch->sequence;
287 
288  for (size_t pos = st_ml->get_first_pos(); pos < st_ml->get_last_pos(); pos++) {
289  int distl = (int) (st_ml->get_rate_at(pos) * lc);
290  st_ml->get_matrix_for(distl).transform(*lefts, *out);
291 
292  // correct frequencies
293 #if defined(WARN_TODO)
294 #warning check if st_ml->get_base_frequency_at(pos).lik is 1 - if so, use vec-mult here
295 #endif
296  for (int i = ST_A; i < ST_MAX_BASE; i++) {
297  out->b[i] *= st_ml->get_base_frequency_at(pos).b[i];
298  }
299 
300  lefts++;
301  out++;
302  }
303 }
304 
306  const char *data = GB_read_char_pntr(get_bound_species_data());
307  for (size_t i = 0; i < ST_MAX_SEQ_PART; i++) {
308  printf("POS %3zu %c ", i, data[i]);
309  printf("\n");
310  }
311 }
312 
313 // --------------
314 // ST_ML
315 
316 ST_ML::ST_ML(GBDATA *gb_maini) :
318  hash_2_ap_tree(NULp),
319  keep_species_hash(NULp),
320  refresh_n(0),
321  not_valid(NULp),
322  tree_root(NULp),
323  latest_modification(0),
324  first_pos(0),
325  last_pos(0),
326  postcalc_cb(NULp),
327  cb_window(NULp),
328  gb_main(gb_maini),
329  column_stat(NULp),
330  rates(NULp),
331  ttratio(NULp),
332  base_frequencies(NULp),
333  inv_base_frequencies(NULp),
334  max_dist(0.0),
335  step_size(0.0),
336  max_rate_matrices(0),
337  rate_matrices(NULp),
338  is_initialized(false)
339 {}
340 
342  delete tree_root;
343  free(alignment_name);
344  if (hash_2_ap_tree) GBS_free_hash(hash_2_ap_tree);
345  delete not_valid;
346  delete [] base_frequencies;
347  delete [] inv_base_frequencies;
348  delete [] rate_matrices;
349  if (!column_stat) {
350  // rates and ttratio have been allocated (see ST_ML::calc_st_ml)
351  delete [] rates;
352  delete [] ttratio;
353  }
354 }
355 
356 
357 void ST_ML::create_frequencies() {
359 
360  size_t filtered_length = get_filtered_length();
361  base_frequencies = new ST_base_vector[filtered_length];
362  inv_base_frequencies = new ST_base_vector[filtered_length];
363 
364  if (!column_stat) {
365  for (size_t i = 0; i < filtered_length; i++) {
366  base_frequencies[i].setTo(1.0);
367  base_frequencies[i].lik = 1.0;
368 
369  inv_base_frequencies[i].setTo(1.0);
370  inv_base_frequencies[i].lik = 1.0;
371  }
372  }
373  else {
374  for (size_t i = 0; i < filtered_length; i++) {
375  const ST_FLOAT NO_FREQ = 0.01;
376  ST_base_vector& base_freq = base_frequencies[i];
377 
378  base_freq.setTo(NO_FREQ);
379 
380  static struct {
381  unsigned char c;
382  DNA_Base b;
383  } toCount[] = {
384  { 'A', ST_A }, { 'a', ST_A },
385  { 'C', ST_C }, { 'c', ST_C },
386  { 'G', ST_G }, { 'g', ST_G },
387  { 'T', ST_T }, { 't', ST_T },
388  { 'U', ST_T }, { 'u', ST_T },
389  { '-', ST_GAP },
390  { 0, ST_UNKNOWN },
391  };
392 
393  for (int j = 0; toCount[j].c; ++j) {
394  const float *freq = column_stat->get_frequencies(toCount[j].c);
395  if (freq) base_freq.b[toCount[j].b] += freq[i];
396  }
397 
398  ST_FLOAT sum = base_freq.summarize();
399  ST_FLOAT smooth = sum*0.01; // smooth by %1 to avoid "crazy values"
400  base_freq.increaseBy(smooth);
401 
402  sum += smooth*ST_MAX_BASE; // correct sum
403 
404  ST_FLOAT min = base_freq.min_frequency();
405 
406  // @@@ if min == 0.0 all inv_base_frequencies will be set to inf ? correct ?
407  // maybe min should be better calculated after next if-else-clause ?
408 
409  if (sum>NO_FREQ) {
410  base_freq.multiplyWith(ST_MAX_BASE/sum);
411  }
412  else {
413  base_freq.setTo(1.0); // columns w/o data
414  }
415 
416  base_freq.lik = 1.0;
417 
418  inv_base_frequencies[i].makeInverseOf(base_freq, min);
419  inv_base_frequencies[i].lik = 1.0;
420  }
421  }
422 }
423 
424 void ST_ML::insert_tree_into_hash_rek(AP_tree *node) {
425  node->gr.gc = 0;
426  if (node->is_leaf()) {
427  GBS_write_hash(hash_2_ap_tree, node->name, (long) node);
428  }
429  else {
430  insert_tree_into_hash_rek(node->get_leftson());
431  insert_tree_into_hash_rek(node->get_rightson());
432  }
433 }
434 
435 void ST_ML::create_matrices(double max_disti, int nmatrices) {
436  delete [] rate_matrices;
437  rate_matrices = new ST_rate_matrix[nmatrices]; // LOOP_VECTORIZED
438 
439  max_dist = max_disti;
440  max_rate_matrices = nmatrices;
441  step_size = max_dist / max_rate_matrices;
442 
443  for (int i = 0; i < max_rate_matrices; i++) {
444  rate_matrices[i].set((i + 1) * step_size, 0); // ttratio[i]
445  }
446 }
447 
448 long ST_ML::delete_species(const char *key, long val, void *cd_st_ml) {
449  ST_ML *st_ml = (ST_ML*)cd_st_ml;
450 
451  if (GBS_read_hash(st_ml->keep_species_hash, key)) {
452  return val;
453  }
454  else {
455  AP_tree *leaf = (AP_tree *)val;
456  UNCOVERED();
457  destroy(leaf->REMOVE());
458 
459  return 0;
460  }
461 }
462 
463 inline GB_ERROR tree_size_ok(AP_tree_root *tree_root) {
464  GB_ERROR error = NULp;
465 
466  AP_tree *root = tree_root->get_root_node();
467  if (!root || root->is_leaf()) {
468  const char *tree_name = tree_root->get_tree_name();
469  error = GBS_global_string("Too few species remained in tree '%s'", tree_name);
470  }
471  return error;
472 }
473 
475  freenull(alignment_name);
476 
477  if (MostLikelySeq::tmp_out) {
478  delete MostLikelySeq::tmp_out;
479  MostLikelySeq::tmp_out = NULp;
480  }
481 
482  delete tree_root;
483  tree_root = NULp;
484 
485  if (hash_2_ap_tree) {
486  GBS_free_hash(hash_2_ap_tree);
487  hash_2_ap_tree = NULp;
488  }
489 
490  is_initialized = false;
491 }
492 
493 GB_ERROR ST_ML::calc_st_ml(const char *tree_name, const char *alignment_namei,
494  const char *species_names, int marked_only,
495  ColumnStat *colstat, const WeightedFilter *weighted_filter)
496 {
497  // acts as contructor, leaks as hell when called twice
498 
499  GB_ERROR error = NULp;
500 
501  if (is_initialized) cleanup();
502 
503  {
504  GB_transaction ta(gb_main);
505  arb_progress progress("Activating column statistic");
506 
507  column_stat = colstat;
508  GB_ERROR column_stat_error = column_stat->calculate(NULp);
509 
510  if (column_stat_error) fprintf(stderr, "Column statistic error: %s (using equal rates/tt-ratio for all columns)\n", column_stat_error);
511 
512  alignment_name = ARB_strdup(alignment_namei);
513  long ali_len = GBT_get_alignment_len(gb_main, alignment_name);
514 
515  if (ali_len<0) {
516  error = GB_await_error();
517  }
518  else if (ali_len<10) {
519  error = "alignment too short";
520  }
521  else {
522  {
523  AliView *aliview = NULp;
524  if (weighted_filter) {
525  aliview = weighted_filter->create_aliview(alignment_name, error);
526  }
527  else {
528  AP_filter filter(ali_len); // unfiltered
529 
530  error = filter.is_invalid();
531  if (!error) {
532  AP_weights weights(&filter);
533  aliview = new AliView(gb_main, filter, weights, alignment_name);
534  }
535  }
536 
537  st_assert(contradicted(aliview, error));
538 
539  if (!error) {
540  MostLikelySeq *seq_templ = new MostLikelySeq(aliview, this); // @@@ error: never freed! (should be freed when freeing tree_root!)
541  tree_root = new AP_tree_root(aliview, seq_templ, false, NULp);
542  // do not delete 'aliview' or 'seq_templ' (they belong to 'tree_root' now)
543  }
544  }
545 
546  if (!error) {
547  tree_root->loadFromDB(tree_name); // tree is not linked!
548 
549  if (!tree_root->get_root_node()) { // no tree
550  error = GBS_global_string("Failed to load tree '%s'", tree_name);
551  }
552  else {
553  {
554  size_t species_in_tree = count_species_in_tree();
555  hash_2_ap_tree = GBS_create_hash(species_in_tree, GB_MIND_CASE);
556  }
557 
558  // delete species from tree:
559  if (species_names) { // keep names
560  tree_root->remove_leafs(AWT_REMOVE_ZOMBIES);
561 
562  error = tree_size_ok(tree_root);
563  if (!error) {
564  char *l, *n;
565  keep_species_hash = GBS_create_hash(GBT_get_species_count(gb_main), GB_MIND_CASE);
566  for (l = (char *) species_names; l; l = n) {
567  n = strchr(l, 1);
568  if (n) *n = 0;
569  GBS_write_hash(keep_species_hash, l, 1);
570  if (n) *(n++) = 1;
571  }
572 
573  insert_tree_into_hash_rek(tree_root->get_root_node());
574  GBS_hash_do_loop(hash_2_ap_tree, delete_species, this);
575  GBS_free_hash(keep_species_hash);
576  keep_species_hash = NULp;
577  GBT_link_tree(tree_root->get_root_node(), gb_main, true, NULp, NULp);
578  }
579  }
580  else { // keep marked/all
581  GBT_link_tree(tree_root->get_root_node(), gb_main, true, NULp, NULp);
582  tree_root->remove_leafs(marked_only ? AWT_KEEP_MARKED : AWT_REMOVE_ZOMBIES);
583 
584  error = tree_size_ok(tree_root);
585  if (!error) insert_tree_into_hash_rek(tree_root->get_root_node());
586  }
587  }
588  }
589 
590  if (!error) {
591  // calc frequencies
592 
593  progress.subtitle("calculating frequencies");
594 
595  size_t filtered_length = get_filtered_length();
596  if (!column_stat_error) {
597  rates = column_stat->get_rates();
598  ttratio = column_stat->get_ttratio();
599  }
600  else {
601  float *alloc_rates = new float[filtered_length];
602  float *alloc_ttratio = new float[filtered_length];
603 
604  for (size_t i = 0; i < filtered_length; i++) { // LOOP_VECTORIZED
605  alloc_rates[i] = 1.0;
606  alloc_ttratio[i] = 2.0;
607  }
608  rates = alloc_rates;
609  ttratio = alloc_ttratio;
610 
611  column_stat = NULp; // mark rates and ttratio as "allocated" (see ST_ML::~ST_ML)
612  }
613  create_frequencies();
614  latest_modification = GB_read_clock(gb_main); // set update time
615  create_matrices(2.0, 1000);
616 
617  MostLikelySeq::tmp_out = new ST_base_vector[filtered_length]; // @@@ error: never freed!
618  is_initialized = true;
619  }
620  }
621 
622  if (error) {
623  cleanup();
624  error = ta.close(error);
625  }
626  }
627  return error;
628 }
629 
630 MostLikelySeq *ST_ML::getOrCreate_seq(AP_tree *node) {
632  if (!seq) {
633  seq = new MostLikelySeq(tree_root->get_aliview(), this); // @@@ why not use dup() ?
634 
635  node->set_seq(seq);
636  if (node->is_leaf()) {
637  st_assert(node->gb_node);
638  seq->bind_to_species(node->gb_node);
639  }
640  }
641  return seq;
642 }
643 
644 const MostLikelySeq *ST_ML::get_mostlikely_sequence(AP_tree *node) {
648  MostLikelySeq *seq = getOrCreate_seq(node);
649  if (!seq->is_up_to_date()) {
650  if (node->is_leaf()) {
651  seq->set_sequence();
652  }
653  else {
654  const MostLikelySeq *leftSeq = get_mostlikely_sequence(node->get_leftson());
655  const MostLikelySeq *rightSeq = get_mostlikely_sequence(node->get_rightson());
656 
657  seq->calculate_ancestor(leftSeq, node->leftlen, rightSeq, node->rightlen);
658  }
659  }
660 
661  return seq;
662 }
663 
665  GB_transaction ta(gb_main);
666  undo_tree(tree_root->get_root_node());
667  latest_modification = GB_read_clock(gb_main);
668 }
669 
670 void ST_ML::undo_tree(AP_tree *node) {
671  MostLikelySeq *seq = getOrCreate_seq(node);
672  seq->forget_sequence();
673  if (!node->is_leaf()) {
674  undo_tree(node->get_leftson());
675  undo_tree(node->get_rightson());
676  }
677 }
678 
679 #define GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT (ST_MAX_SEQ_PART-1) // workaround bug in get_ml_vectors
680 
681 MostLikelySeq *ST_ML::get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos) {
682  /* result will be in tmp_out
683  *
684  * assert end_ali_pos - start_ali_pos < ST_MAX_SEQ_PART
685  *
686  * @@@ CAUTION!!! get_ml_vectors has a bug:
687  * it does not calculate the last value, if (end_ali_pos-start_ali_pos+1)==ST_MAX_SEQ_PART
688  * (search for GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT)
689  *
690  * I'm not sure whether this is really a bug! Maybe it's only some misunderstanding about
691  * 'end_ali_pos', because it does not mark the last calculated position, but the position
692  * behind the last calculated position! @@@ Need to rename it!
693  *
694  */
695 
696  if (!node) {
697  if (!hash_2_ap_tree) return NULp;
698  node = (AP_tree *) GBS_read_hash(hash_2_ap_tree, species_name);
699  if (!node) return NULp;
700  }
701 
702  st_assert(start_ali_pos<end_ali_pos);
703  st_assert((end_ali_pos - start_ali_pos + 1) <= ST_MAX_SEQ_PART);
704 
705  MostLikelySeq *seq = getOrCreate_seq(node);
706 
707  if (start_ali_pos != first_pos || end_ali_pos > last_pos) {
708  undo_tree(tree_root->get_root_node()); // undo everything
709  first_pos = start_ali_pos;
710  last_pos = end_ali_pos;
711  }
712 
713  AP_tree *pntr;
714  for (pntr = node->get_father(); pntr; pntr = pntr->get_father()) {
715  MostLikelySeq *sequ = getOrCreate_seq(pntr);
716  if (sequ) sequ->forget_sequence();
717  }
718 
719  node->set_root();
720 
721  const MostLikelySeq *seq_of_brother = get_mostlikely_sequence(node->get_brother());
722 
723  seq->calc_out(seq_of_brother, node->father->leftlen + node->father->rightlen);
724  return seq;
725 }
726 
727 bool ST_ML::update_ml_likelihood(char *result[4], int& latest_update, const char *species_name, AP_tree *node) {
741  st_assert(contradicted(species_name, node));
742 
743  if (latest_update < latest_modification) {
744  if (!node) { // if node isn't given search it using species name
745  st_assert(hash_2_ap_tree); // ST_ML was not prepared for search-by-name
746  if (hash_2_ap_tree) node = (AP_tree *) GBS_read_hash(hash_2_ap_tree, species_name);
747  if (!node) return false;
748  }
749 
750  DNA_Base adb[4];
751  int i;
752 
753  size_t ali_len = get_alignment_length();
754  st_assert(get_filtered_length() == ali_len); // assume column stat was calculated w/o filters
755 
756  if (!result[0]) { // allocate Array-elements for result
757  for (i = 0; i < 4; i++) {
758  ARB_calloc(result[i], ali_len+1); // [0 .. alignment_len[ + zerobyte
759  }
760  }
761 
762  for (i = 0; i < 4; i++) {
763  adb[i] = dna_table.char_to_enum("ACGU"[i]);
764  }
765 
766  for (size_t seq_start = 0; seq_start < ali_len; seq_start += GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT) {
767  size_t seq_end = std::min(ali_len, seq_start+GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT);
768  get_ml_vectors(NULp, node, seq_start, seq_end);
769  }
770 
771  MostLikelySeq *seq = getOrCreate_seq(node);
772 
773  for (size_t pos = 0; pos < ali_len; pos++) {
774  ST_base_vector& vec = seq->tmp_out[pos];
775  double sum = vec.summarize();
776 
777  if (sum == 0) {
778  for (i = 0; i < 4; i++) {
779  result[i][pos] = -1;
780  }
781  }
782  else {
783  double div = 100.0 / sum;
784 
785  for (i = 0; i < 4; i++) {
786  result[i][pos] = char ((vec.b[adb[i]] * div) + 0.5);
787  }
788  }
789  }
790 
791  latest_update = latest_modification;
792  }
793  return true;
794 }
795 
796 ST_ML_Color *ST_ML::get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos) {
800  if (!node) {
801  // if node isn't given, search it using species name:
802  if (!hash_2_ap_tree) return NULp;
803  node = (AP_tree *) GBS_read_hash(hash_2_ap_tree, species_name);
804  if (!node) return NULp;
805  }
806 
807  // align start_ali_pos/end_ali_pos to previous/next pos divisible by ST_BUCKET_SIZE:
808  start_ali_pos &= ~(ST_BUCKET_SIZE - 1);
809  end_ali_pos = (end_ali_pos & ~(ST_BUCKET_SIZE - 1)) + ST_BUCKET_SIZE - 1;
810 
811  size_t ali_len = get_alignment_length();
812  if (end_ali_pos > ali_len) {
813  end_ali_pos = ali_len;
814  }
815 
816  double val;
817  MostLikelySeq *seq = getOrCreate_seq(node);
818  size_t pos;
819 
820  if (!seq->color_out) { // allocate mem for color_out if we not already have it
821  ARB_calloc(seq->color_out, ali_len);
823  }
824  // search for first out-dated position:
825  for (pos = start_ali_pos; pos <= end_ali_pos; pos += ST_BUCKET_SIZE) {
826  if (seq->color_out_valid_till[pos >> LD_BUCKET_SIZE] < latest_modification) break;
827  }
828  if (pos > end_ali_pos) { // all positions are up-to-date
829  return seq->color_out; // => return existing result
830  }
831 
832  for (size_t start = start_ali_pos; start <= end_ali_pos; start += GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT) {
834  get_ml_vectors(NULp, node, start, end); // calculates tmp_out (see below)
835  }
836 
837  const char *source_sequence = NULp;
838  GBDATA *gb_data = seq->get_bound_species_data();
839  if (gb_data) source_sequence = GB_read_char_pntr(gb_data);
840 
841  // create color string in 'outs':
842  ST_ML_Color *outs = seq->color_out + start_ali_pos;
843  ST_base_vector *vec = seq->tmp_out + start_ali_pos; // tmp_out was calculated by get_ml_vectors above
844  const char *source = source_sequence + start_ali_pos;
845 
846  for (pos = start_ali_pos; pos <= end_ali_pos; pos++) {
847  {
848  DNA_Base b = dna_table.char_to_enum(*source); // convert seq-character to enum DNA_Base
849  *outs = 0;
850 
851  if (b != ST_UNKNOWN) {
852  ST_FLOAT max = vec->max_frequency();
853  val = max / (0.0001 + vec->b[b]); // calc ratio of max/real base-char
854 
855  if (val > 1.0) { // if real base-char is NOT the max-likely base-char
856  *outs = (int) (log(val)); // => insert color
857  }
858  }
859  }
860  outs++;
861  vec++;
862  source++;
863 
864  seq->color_out_valid_till[pos >> LD_BUCKET_SIZE] = latest_modification;
865  }
866  return seq->color_out;
867 }
868 
869 void ST_ML::create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment) {
870  column_stat = new ColumnStat(get_gb_main(), awr, awarname, awar_default_alignment);
871 }
872 
874  return tree_root->get_root_node();
875 }
876 
879  tree_root->get_root_node()->calcTreeInfo(info);
880  return info.leafs;
881 }
882 
883 AP_tree *ST_ML::find_node_by_name(const char *species_name) {
884  AP_tree *node = NULp;
885  if (hash_2_ap_tree) node = (AP_tree *)GBS_read_hash(hash_2_ap_tree, species_name);
886  return node;
887 }
888 
889 const AP_filter *ST_ML::get_filter() const { return tree_root->get_filter(); }
891 size_t ST_ML::get_alignment_length() const { return get_filter()->get_length(); }
892 
GBDATA * get_gb_main() const
Definition: st_ml.hxx:194
ST_ML(GBDATA *gb_main)
Definition: ST_ml.cxx:316
GB_ERROR calc_st_ml(const char *tree_name, const char *alignment_name, const char *species_names, int marked_only, ColumnStat *colstat, const WeightedFilter *weighted_filter) __ATTR__USERESULT
Definition: ST_ml.cxx:493
const char * GB_ERROR
Definition: arb_core.h:25
const AliView * get_aliview() const
Definition: ARB_Tree.hxx:75
string result
long remove_leafs(AWT_RemoveType awt_remove_type)
Definition: AP_Tree.cxx:944
void setTo(ST_FLOAT freq)
Definition: st_ml.hxx:64
size_t get_first_pos() const
Definition: st_ml.hxx:206
void clear_all()
Definition: ST_ml.cxx:664
const float * get_frequencies(unsigned char c) const
Definition: ColumnStat.hxx:93
const TreeNode * get_gbt_tree() const
Definition: ST_ml.cxx:873
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:457
const ST_base_vector & get_base_frequency_at(size_t pos) const
Definition: st_ml.hxx:212
virtual void set_root()
Definition: TreeNode.cxx:206
GB_ERROR GB_add_callback(GBDATA *gbd, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:356
void increaseBy(ST_FLOAT inc)
Definition: st_ml.hxx:74
MostLikelySeq(const AliView *aliview, ST_ML *st_ml_)
Definition: ST_ml.cxx:160
void print()
Definition: ST_ml.cxx:128
unsigned char ST_ML_Color
Definition: st_ml.hxx:42
Definition: st_ml.hxx:54
DNA_Base
Definition: st_ml.hxx:49
static void st_sequence_del_callback(GBDATA *, MostLikelySeq *seq)
Definition: ST_ml.cxx:181
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
ST_ML * st_ml
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:718
static char * alignment_name
AP_tree_members gr
Definition: AP_Tree.hxx:214
const ST_base_vector * get_inv_base_frequencies() const
Definition: st_ml.hxx:211
float get_rate_at(size_t pos) const
Definition: st_ml.hxx:213
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:541
void transform(const ST_base_vector &in, ST_base_vector &out) const
Definition: ST_ml.cxx:138
void print()
Definition: ST_ml.cxx:110
void multiplyWith(ST_FLOAT factor)
Definition: st_ml.hxx:69
static ST_base_vector * tmp_out
GB_ERROR GBT_link_tree(TreeNode *tree, GBDATA *gb_main, bool show_status, int *zombies, int *duplicates)
Definition: adtree.cxx:907
const int ST_BUCKET_SIZE
FILE * seq
Definition: rns.c:46
GBT_LEN leftlen
Definition: TreeNode.h:132
Definition: st_ml.hxx:52
Definition: st_ml.hxx:50
#define DOWNCAST(totype, expr)
Definition: downcast.h:141
const AP_filter * get_filter() const
Definition: ARB_Tree.hxx:77
AP_tree * find_node_by_name(const char *species_name)
Definition: ST_ml.cxx:883
size_t leafs
Definition: ARB_Tree.hxx:105
Definition: st_ml.hxx:111
virtual AP_tree * REMOVE()
Definition: AP_Tree.cxx:259
ST_FLOAT min_frequency()
Definition: st_ml.hxx:91
static HelixNrInfo * start
void set_sequence()
Definition: ST_ml.cxx:224
AP_sequence * dup() const OVERRIDE
Definition: ST_ml.cxx:213
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:910
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
#define GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT
Definition: ST_ml.cxx:679
GBDATA * get_bound_species_data() const
void print()
Definition: ST_ml.cxx:305
const float * get_rates() const
Definition: ColumnStat.hxx:90
size_t get_filtered_length() const
Definition: ST_ml.cxx:890
const char * get_tree_name() const
Definition: ARB_Tree.hxx:85
DNA_Base char_to_enum(char i)
const char * awarname(const char *awarname_template, int idx)
Definition: ED4_flags.cxx:37
const float * get_ttratio() const
Definition: ColumnStat.hxx:91
double get_step_size() const
Definition: st_ml.hxx:209
void calculate_ancestor(const MostLikelySeq *lefts, double leftl, const MostLikelySeq *rights, double rightl)
Definition: ST_ml.cxx:247
MostLikelySeq * get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos)
Definition: ST_ml.cxx:681
#define false
Definition: ureadseq.h:13
static int weights[MAX_BASETYPES][MAX_BASETYPES]
Definition: ClustalV.cxx:71
void makeInverseOf(const ST_base_vector &other, ST_FLOAT other_min_freq)
Definition: st_ml.hxx:79
size_t count_species_in_tree() const
Definition: ST_ml.cxx:877
TreeNode * father
Definition: TreeNode.h:131
static void error(const char *msg)
Definition: mkptypes.cxx:96
ST_FLOAT max_frequency()
Definition: st_ml.hxx:92
DNA_Table dna_table
Definition: ST_ml.cxx:24
AP_sequence * get_seq()
Definition: ARB_Tree.hxx:162
size_t get_length() const
Definition: AP_filter.hxx:83
__ATTR__USERESULT GB_ERROR calculate(AP_filter *filter)
Definition: ColumnStat.cxx:107
GB_ERROR loadFromDB(const char *name) FINAL_OVERRIDE
Definition: AP_Tree.cxx:891
AP_sequence * set_seq(AP_sequence *sequence)
Definition: ARB_Tree.hxx:164
TreeNode * get_brother()
Definition: TreeNode.h:394
ST_rate_matrix & get_matrix_for(int distance)
Definition: st_ml.hxx:190
ST_FLOAT summarize() const
Definition: st_ml.hxx:90
void unbind_from_species()
Definition: AP_sequence.cxx:40
GBT_LEN rightlen
Definition: TreeNode.h:132
void GBS_hash_do_loop(GB_HASH *hs, gb_hash_loop_type func, void *client_data)
Definition: adhash.cxx:548
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
void check_overflow()
Definition: ST_ml.cxx:79
ST_FLOAT b[ST_MAX_BASE]
Definition: st_ml.hxx:60
ST_ML_Color * color_out
void calc_out(const MostLikelySeq *sequence_of_brother, double dist)
Definition: ST_ml.cxx:281
bool is_up_to_date() const
const AliView * get_aliview() const
Definition: AP_sequence.hxx:82
bool is_leaf() const
Definition: TreeNode.h:171
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
AliView * create_aliview(const char *aliname, GB_ERROR &error) const
Definition: GUI_aliview.cxx:67
int * color_out_valid_till
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:32
void cleanup()
Definition: ST_ml.cxx:474
bool update_ml_likelihood(char *result[4], int &latest_update, const char *species_name, AP_tree *node)
Definition: ST_ml.cxx:727
void setBase(const ST_base_vector &frequencies, char base)
Definition: ST_ml.cxx:55
static double ttratio
void sequence_change()
Definition: ST_ml.cxx:209
const AP_filter * get_filter() const
Definition: ST_ml.cxx:889
GB_ERROR tree_size_ok(AP_tree_root *tree_root)
Definition: ST_ml.cxx:463
char * name
Definition: TreeNode.h:134
void subtitle(const char *stitle)
Definition: arb_progress.h:263
ST_FLOAT lik
Definition: st_ml.hxx:62
void forget_sequence()
void GB_remove_callback(GBDATA *gbd, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:360
GB_ERROR is_invalid() const
Definition: AP_filter.hxx:123
GB_ERROR bind_to_species(GBDATA *gb_species)
Definition: ST_ml.cxx:186
#define st_assert(cond)
Definition: st_ml.hxx:30
long GB_read_clock(GBDATA *gbd)
Definition: arbdb.cxx:1712
DNA_Table()
Definition: ST_ml.cxx:26
#define NULp
Definition: cxxforward.h:97
const int LD_BUCKET_SIZE
size_t get_last_pos() const
Definition: st_ml.hxx:207
size_t get_alignment_length() const
Definition: ST_ml.cxx:891
ST_ML_Color * get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos)
Definition: ST_ml.cxx:796
Definition: st_ml.hxx:51
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GB_transaction ta(gb_var)
Definition: st_ml.hxx:53
void destroy(TreeNode *that)
Definition: TreeNode.h:559
GB_ERROR bind_to_species(GBDATA *gb_species)
Definition: AP_sequence.cxx:24
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GBDATA * gb_node
Definition: TreeNode.h:133
GBDATA * gb_main
Definition: adname.cxx:33
void set(double dist, double TT_ratio)
Definition: ST_ml.cxx:120
~MostLikelySeq() OVERRIDE
Definition: ST_ml.cxx:169
const size_t ST_MAX_SEQ_PART
#define min(a, b)
Definition: f2c.h:153
static int info[maxsites+1]
void create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment)
Definition: ST_ml.cxx:869
uint32_t gc
Definition: AP_Tree.hxx:152
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:395
void unbind_from_species(bool remove_callbacks)
Definition: ST_ml.cxx:197
ST_base_vector & operator*=(const ST_base_vector &other)
Definition: ST_ml.cxx:97
float ST_FLOAT
Definition: st_ml.hxx:46
~ST_ML()
Definition: ST_ml.cxx:341
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
#define UNCOVERED()
Definition: arb_assert.h:380
static void st_sequence_callback(GBDATA *, MostLikelySeq *seq)
Definition: ST_ml.cxx:177
#define max(a, b)
Definition: f2c.h:154