ARB
ST_ml.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ST_ml.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "st_ml.hxx"
12 #include "MostLikelySeq.hxx"
13 
14 #include <ColumnStat.hxx>
15 #include <AP_filter.hxx>
16 #include <AP_Tree.hxx>
17 #include <arb_progress.h>
18 #include <gui_aliview.hxx>
19 #include <ad_cb.h>
20 
21 #include <cctype>
22 #include <cmath>
23 
25 
27  int i;
28  for (i = 0; i < 256; i++) {
29  switch (toupper(i)) {
30  case 'A':
31  char_to_enum_table[i] = ST_A;
32  break;
33  case 'C':
34  char_to_enum_table[i] = ST_C;
35  break;
36  case 'G':
37  char_to_enum_table[i] = ST_G;
38  break;
39  case 'T':
40  case 'U':
41  char_to_enum_table[i] = ST_T;
42  break;
43  case '-':
44  char_to_enum_table[i] = ST_GAP;
45  break;
46  default:
47  char_to_enum_table[i] = ST_UNKNOWN;
48  }
49  }
50 }
51 
52 // -----------------------
53 // ST_base_vector
54 
55 void ST_base_vector::setBase(const ST_base_vector& inv_frequencies, char base) {
56  base = toupper(base);
57 
58  memset((char *) &b[0], 0, sizeof(b));
59  DNA_Base ub = dna_table.char_to_enum(base);
60 
61  if (ub != ST_UNKNOWN) {
62  b[ub] = 1.0; // ill. access ?
63  }
64  else {
65  const double k = 1.0 / ST_MAX_BASE;
66  b[ST_A] = k;
67  b[ST_C] = k;
68  b[ST_G] = k;
69  b[ST_T] = k;
70  b[ST_GAP] = k;
71  }
72  for (int i = 0; i < ST_MAX_BASE; i++) { // LOOP_VECTORIZED[!<5.0]
73  b[i] *= inv_frequencies.b[i];
74  }
75  ld_lik = 0; // ? why not 1.0 ?
76  lik = 1.0;
77 }
78 
80  ST_FLOAT sum = summarize();
81 
82  if (sum < .00001) { // what happend no data, extremely unlikely
83  setTo(0.25); // strange! shouldn't this be 1.0/ST_MAX_BASE ?
84  ld_lik -= 5; // ???
85  }
86  else {
87  while (sum < 0.25) {
88  sum *= 4;
89  ld_lik -= 2;
90  multiplyWith(4);
91  }
92  }
93 
94  if (ld_lik> 10000) printf("overflow\n");
95 }
96 
98  b[ST_A] *= other.b[ST_A];
99  b[ST_C] *= other.b[ST_C];
100  b[ST_G] *= other.b[ST_G];
101  b[ST_T] *= other.b[ST_T];
102  b[ST_GAP] *= other.b[ST_GAP];
103 
104  ld_lik += other.ld_lik; // @@@ correct to use 'plus' here ? why ?
105  lik *= other.lik;
106 
107  return *this;
108 }
109 
111  int i;
112  for (i = 0; i < ST_MAX_BASE; i++) {
113  printf("%.3G ", b[i]);
114  }
115 }
116 
117 // -----------------------
118 // ST_rate_matrix
119 
120 void ST_rate_matrix::set(double dist, double /* TT_ratio */) {
121  const double k = 1.0 / ST_MAX_BASE;
122  ST_FLOAT exp_dist = exp(-dist);
123 
124  diag = k + (1.0 - k) * exp_dist;
125  rest = k - k * exp_dist;
126 }
127 
129  for (int i = 0; i < ST_MAX_BASE; i++) {
130  for (int j = 0; j < ST_MAX_BASE; j++) {
131  printf("%.3G ", i == j ? diag : rest);
132  }
133  printf("\n");
134  }
135 }
136 
137 
138 inline void ST_rate_matrix::transform(const ST_base_vector& in, ST_base_vector& out) const {
139  // optimized matrix/vector multiplication
140  // original version: http://bugs.arb-home.de/browser/trunk/STAT/ST_ml.cxx?rev=6403#L155
141 
142  ST_FLOAT sum = in.summarize();
143  ST_FLOAT diag_rest_diff = diag-rest;
144  ST_FLOAT sum_rest_prod = sum*rest;
145 
146  out.b[ST_A] = in.b[ST_A]*diag_rest_diff + sum_rest_prod;
147  out.b[ST_C] = in.b[ST_C]*diag_rest_diff + sum_rest_prod;
148  out.b[ST_G] = in.b[ST_G]*diag_rest_diff + sum_rest_prod;
149  out.b[ST_T] = in.b[ST_T]*diag_rest_diff + sum_rest_prod;
150  out.b[ST_GAP] = in.b[ST_GAP]*diag_rest_diff + sum_rest_prod;
151 
152  out.ld_lik = in.ld_lik;
153  out.lik = in.lik;
154 }
155 
156 
157 // -----------------------
158 // MostLikelySeq
159 
160 MostLikelySeq::MostLikelySeq(const AliView *aliview, ST_ML *st_ml_) :
161  AP_sequence(aliview),
162  st_ml(st_ml_),
163  sequence(new ST_base_vector[ST_MAX_SEQ_PART]),
164  up_to_date(false),
165  color_out(NULp),
166  color_out_valid_till(NULp)
167 {}
168 
170  delete [] sequence;
171  free(color_out);
172  free(color_out_valid_till);
173 
174  unbind_from_species(true);
175 }
176 
178  seq->sequence_change();
179 }
180 
182  seq->unbind_from_species(false);
183 }
184 
185 
188  if (!error) {
189  GBDATA *gb_seq = get_bound_species_data();
190  st_assert(gb_seq);
191 
192  error = GB_add_callback(gb_seq, GB_CB_CHANGED, makeDatabaseCallback(st_sequence_callback, this));
193  if (!error) error = GB_add_callback(gb_seq, GB_CB_DELETE, makeDatabaseCallback(st_sequence_del_callback, this));
194  }
195  return error;
196 }
197 void MostLikelySeq::unbind_from_species(bool remove_callbacks) {
198  GBDATA *gb_seq = get_bound_species_data();
199 
200  if (gb_seq) {
201  if (remove_callbacks) {
202  GB_remove_callback(gb_seq, GB_CB_CHANGED, makeDatabaseCallback(st_sequence_callback, this));
203  GB_remove_callback(gb_seq, GB_CB_DELETE, makeDatabaseCallback(st_sequence_del_callback, this));
204  }
206  }
207 }
208 
210  st_ml->clear_all();
211 }
212 
214  return new MostLikelySeq(get_aliview(), st_ml);
215 }
216 
217 void MostLikelySeq::set(const char *) {
218  st_assert(0); // hmm why not perform set_sequence() here ?
219 }
220 
221 void MostLikelySeq::unset() {
222 }
223 
229  GBDATA *gb_data = get_bound_species_data();
230  st_assert(gb_data);
231 
232  size_t source_sequence_len = (size_t)GB_read_string_count(gb_data);
233  const char *source_sequence = GB_read_char_pntr(gb_data) + st_ml->get_first_pos();
234  ST_base_vector *dest = sequence;
235  const ST_base_vector *freq = st_ml->get_inv_base_frequencies() + st_ml->get_first_pos();
236 
237  size_t range_len = st_ml->get_last_pos() - st_ml->get_first_pos();
238  size_t data_len = std::min(range_len, source_sequence_len);
239  size_t pos = 0;
240 
241  for (; pos<data_len; ++pos) dest[pos].setBase(freq[pos], toupper(source_sequence[pos]));
242  for (; pos<range_len; ++pos) dest[pos].setBase(freq[pos], '.');
243 
244  up_to_date = true;
245 }
246 
247 void MostLikelySeq::calculate_ancestor(const MostLikelySeq *lefts, double leftl, const MostLikelySeq *rights, double rightl) {
248  st_assert(!up_to_date);
249 
250  ST_base_vector hbv;
251  double lc = leftl / st_ml->get_step_size();
252  double rc = rightl / st_ml->get_step_size();
253  const ST_base_vector *lb = lefts->sequence;
254  const ST_base_vector *rb = rights->sequence;
255  ST_base_vector *dest = sequence;
256 
257  for (size_t pos = st_ml->get_first_pos(); pos < st_ml->get_last_pos(); pos++) {
258  st_assert(lb->lik == 1 && rb->lik == 1);
259 
260  int distl = (int) (st_ml->get_rate_at(pos) * lc);
261  int distr = (int) (st_ml->get_rate_at(pos) * rc);
262 
263  st_ml->get_matrix_for(distl).transform(*lb, *dest);
264  st_ml->get_matrix_for(distr).transform(*rb, hbv);
265 
266  *dest *= hbv;
267  dest->check_overflow();
268 
269  st_assert(dest->lik == 1);
270 
271  dest++;
272  lb++;
273  rb++;
274  }
275 
276  up_to_date = true;
277 }
278 
280 
281 void MostLikelySeq::calc_out(const MostLikelySeq *next_branch, double dist) {
282  // result will be in tmp_out
283 
284  ST_base_vector *out = tmp_out + st_ml->get_first_pos();
285  double lc = dist / st_ml->get_step_size();
286  ST_base_vector *lefts = next_branch->sequence;
287 
288  for (size_t pos = st_ml->get_first_pos(); pos < st_ml->get_last_pos(); pos++) {
289  int distl = (int) (st_ml->get_rate_at(pos) * lc);
290  st_ml->get_matrix_for(distl).transform(*lefts, *out);
291 
292  // correct frequencies
293  // @@@ check if st_ml->get_base_frequency_at(pos).lik is 1 - if so, use vec-mult here
294  for (int i = ST_A; i < ST_MAX_BASE; i++) {
295  out->b[i] *= st_ml->get_base_frequency_at(pos).b[i];
296  }
297 
298  lefts++;
299  out++;
300  }
301 }
302 
304  const char *data = GB_read_char_pntr(get_bound_species_data());
305  for (size_t i = 0; i < ST_MAX_SEQ_PART; i++) {
306  printf("POS %3zu %c ", i, data[i]);
307  printf("\n");
308  }
309 }
310 
311 // --------------
312 // ST_ML
313 
314 ST_ML::ST_ML(GBDATA *gb_maini) :
316  hash_2_ap_tree(NULp),
317  keep_species_hash(NULp),
318  refresh_n(0),
319  not_valid(NULp),
320  tree_root(NULp),
321  latest_modification(0),
322  first_pos(0),
323  last_pos(0),
324  postcalc_cb(NULp),
325  cb_window(NULp),
326  gb_main(gb_maini),
327  column_stat(NULp),
328  rates(NULp),
329  ttratio(NULp),
330  base_frequencies(NULp),
331  inv_base_frequencies(NULp),
332  max_dist(0.0),
333  step_size(0.0),
334  max_rate_matrices(0),
335  rate_matrices(NULp),
336  is_initialized(false)
337 {}
338 
340  delete tree_root;
341  free(alignment_name);
342  if (hash_2_ap_tree) GBS_free_hash(hash_2_ap_tree);
343  delete not_valid;
344  delete [] base_frequencies;
345  delete [] inv_base_frequencies;
346  delete [] rate_matrices;
347  if (!column_stat) {
348  // rates and ttratio have been allocated (see ST_ML::calc_st_ml)
349  delete [] rates;
350  delete [] ttratio;
351  }
352 }
353 
354 
355 void ST_ML::create_frequencies() {
357 
358  size_t filtered_length = get_filtered_length();
359  base_frequencies = new ST_base_vector[filtered_length];
360  inv_base_frequencies = new ST_base_vector[filtered_length];
361 
362  if (!column_stat) {
363  for (size_t i = 0; i < filtered_length; i++) {
364  base_frequencies[i].setTo(1.0);
365  base_frequencies[i].lik = 1.0;
366 
367  inv_base_frequencies[i].setTo(1.0);
368  inv_base_frequencies[i].lik = 1.0;
369  }
370  }
371  else {
372  for (size_t i = 0; i < filtered_length; i++) {
373  const ST_FLOAT NO_FREQ = 0.01;
374  ST_base_vector& base_freq = base_frequencies[i];
375 
376  base_freq.setTo(NO_FREQ);
377 
378  static struct {
379  unsigned char c;
380  DNA_Base b;
381  } toCount[] = {
382  { 'A', ST_A }, { 'a', ST_A },
383  { 'C', ST_C }, { 'c', ST_C },
384  { 'G', ST_G }, { 'g', ST_G },
385  { 'T', ST_T }, { 't', ST_T },
386  { 'U', ST_T }, { 'u', ST_T },
387  { '-', ST_GAP },
388  { 0, ST_UNKNOWN },
389  };
390 
391  for (int j = 0; toCount[j].c; ++j) {
392  const float *freq = column_stat->get_frequencies(toCount[j].c);
393  if (freq) base_freq.b[toCount[j].b] += freq[i];
394  }
395 
396  ST_FLOAT sum = base_freq.summarize();
397  ST_FLOAT smooth = sum*0.01; // smooth by %1 to avoid "crazy values"
398  base_freq.increaseBy(smooth);
399 
400  sum += smooth*ST_MAX_BASE; // correct sum
401 
402  ST_FLOAT min = base_freq.min_frequency();
403 
404  // @@@ if min == 0.0 all inv_base_frequencies will be set to inf ? correct ?
405  // maybe min should be better calculated after next if-else-clause ?
406 
407  if (sum>NO_FREQ) {
408  base_freq.multiplyWith(ST_MAX_BASE/sum);
409  }
410  else {
411  base_freq.setTo(1.0); // columns w/o data
412  }
413 
414  base_freq.lik = 1.0;
415 
416  inv_base_frequencies[i].makeInverseOf(base_freq, min);
417  inv_base_frequencies[i].lik = 1.0;
418  }
419  }
420 }
421 
422 void ST_ML::insert_tree_into_hash_rek(AP_tree *node) {
423  node->gr.gc = 0;
424  if (node->is_leaf()) {
425  GBS_write_hash(hash_2_ap_tree, node->name, (long) node);
426  }
427  else {
428  insert_tree_into_hash_rek(node->get_leftson());
429  insert_tree_into_hash_rek(node->get_rightson());
430  }
431 }
432 
433 void ST_ML::create_matrices(double max_disti, int nmatrices) {
434  delete [] rate_matrices;
435  rate_matrices = new ST_rate_matrix[nmatrices]; // LOOP_VECTORIZED
436 
437  max_dist = max_disti;
438  max_rate_matrices = nmatrices;
439  step_size = max_dist / max_rate_matrices;
440 
441  for (int i = 0; i < max_rate_matrices; i++) {
442  rate_matrices[i].set((i + 1) * step_size, 0); // ttratio[i]
443  }
444 }
445 
446 long ST_ML::delete_species(const char *key, long val, void *cd_st_ml) {
447  ST_ML *st_ml = (ST_ML*)cd_st_ml;
448 
449  if (GBS_read_hash(st_ml->keep_species_hash, key)) {
450  return val;
451  }
452  else {
453  AP_tree *leaf = (AP_tree *)val;
454  UNCOVERED();
455  destroy(leaf->REMOVE());
456 
457  return 0;
458  }
459 }
460 
461 inline GB_ERROR tree_size_ok(AP_tree_root *tree_root) {
462  GB_ERROR error = NULp;
463 
464  AP_tree *root = tree_root->get_root_node();
465  if (!root || root->is_leaf()) {
466  const char *tree_name = tree_root->get_tree_name();
467  error = GBS_global_string("Too few species remained in tree '%s'", tree_name);
468  }
469  return error;
470 }
471 
473  freenull(alignment_name);
474 
475  if (MostLikelySeq::tmp_out) {
476  delete MostLikelySeq::tmp_out;
477  MostLikelySeq::tmp_out = NULp;
478  }
479 
480  delete tree_root;
481  tree_root = NULp;
482 
483  if (hash_2_ap_tree) {
484  GBS_free_hash(hash_2_ap_tree);
485  hash_2_ap_tree = NULp;
486  }
487 
488  is_initialized = false;
489 }
490 
491 GB_ERROR ST_ML::calc_st_ml(const char *tree_name, const char *alignment_namei,
492  const char *species_names, int marked_only,
493  ColumnStat *colstat, const WeightedFilter *weighted_filter)
494 {
495  // acts as contructor, leaks as hell when called twice
496 
497  GB_ERROR error = NULp;
498 
499  if (is_initialized) cleanup();
500 
501  {
502  GB_transaction ta(gb_main);
503  arb_progress progress("Activating column statistic");
504 
505  column_stat = colstat;
506  GB_ERROR column_stat_error = column_stat->calculate(NULp);
507 
508  if (column_stat_error) fprintf(stderr, "Column statistic error: %s (using equal rates/tt-ratio for all columns)\n", column_stat_error);
509 
510  alignment_name = ARB_strdup(alignment_namei);
511  long ali_len = GBT_get_alignment_len(gb_main, alignment_name);
512 
513  if (ali_len<=0) {
514  error = GB_await_error();
515  }
516  else if (ali_len<10) {
517  error = "alignment too short";
518  }
519  else {
520  {
521  AliView *aliview = NULp;
522  if (weighted_filter) {
523  aliview = weighted_filter->create_aliview(alignment_name, error);
524  }
525  else {
526  AP_filter filter(ali_len); // unfiltered
527 
528  error = filter.is_invalid();
529  if (!error) {
530  AP_weights weights(&filter);
531  aliview = new AliView(gb_main, filter, weights, alignment_name);
532  }
533  }
534 
535  st_assert(contradicted(aliview, error));
536 
537  if (!error) {
538  MostLikelySeq *seq_templ = new MostLikelySeq(aliview, this); // @@@ error: never freed! (should be freed when freeing tree_root!)
539  tree_root = new AP_tree_root(aliview, seq_templ, false, NULp);
540  // do not delete 'aliview' or 'seq_templ' (they belong to 'tree_root' now)
541  }
542  }
543 
544  if (!error) {
545  tree_root->loadFromDB(tree_name); // tree is not linked!
546 
547  if (!tree_root->get_root_node()) { // no tree
548  error = GBS_global_string("Failed to load tree '%s'", tree_name);
549  }
550  else {
551  {
552  size_t species_in_tree = count_species_in_tree();
553  hash_2_ap_tree = GBS_create_hash(species_in_tree, GB_MIND_CASE);
554  }
555 
556  // delete species from tree:
557  if (species_names) { // keep names
558  tree_root->remove_leafs(AWT_REMOVE_ZOMBIES);
559 
560  error = tree_size_ok(tree_root);
561  if (!error) {
562  char *l, *n;
563  keep_species_hash = GBS_create_hash(GBT_get_species_count(gb_main), GB_MIND_CASE);
564  for (l = (char *) species_names; l; l = n) {
565  n = strchr(l, 1);
566  if (n) *n = 0;
567  GBS_write_hash(keep_species_hash, l, 1);
568  if (n) *(n++) = 1;
569  }
570 
571  insert_tree_into_hash_rek(tree_root->get_root_node());
572  GBS_hash_do_loop(hash_2_ap_tree, delete_species, this);
573  GBS_free_hash(keep_species_hash);
574  keep_species_hash = NULp;
575  GBT_link_tree(tree_root->get_root_node(), gb_main, true, NULp, NULp);
576  }
577  }
578  else { // keep marked/all
579  GBT_link_tree(tree_root->get_root_node(), gb_main, true, NULp, NULp);
580  tree_root->remove_leafs(marked_only ? AWT_KEEP_MARKED : AWT_REMOVE_ZOMBIES);
581 
582  error = tree_size_ok(tree_root);
583  if (!error) insert_tree_into_hash_rek(tree_root->get_root_node());
584  }
585  }
586  }
587 
588  if (!error) {
589  // calc frequencies
590 
591  progress.subtitle("calculating frequencies");
592 
593  size_t filtered_length = get_filtered_length();
594  if (!column_stat_error) {
595  rates = column_stat->get_rates();
596  ttratio = column_stat->get_ttratio();
597  }
598  else {
599  float *alloc_rates = new float[filtered_length];
600  float *alloc_ttratio = new float[filtered_length];
601 
602  for (size_t i = 0; i < filtered_length; i++) { // LOOP_VECTORIZED
603  alloc_rates[i] = 1.0;
604  alloc_ttratio[i] = 2.0;
605  }
606  rates = alloc_rates;
607  ttratio = alloc_ttratio;
608 
609  column_stat = NULp; // mark rates and ttratio as "allocated" (see ST_ML::~ST_ML)
610  }
611  create_frequencies();
612  latest_modification = GB_read_clock(gb_main); // set update time
613  create_matrices(2.0, 1000);
614 
615  MostLikelySeq::tmp_out = new ST_base_vector[filtered_length]; // @@@ error: never freed!
616  is_initialized = true;
617  }
618  }
619 
620  if (error) {
621  cleanup();
622  error = ta.close(error);
623  }
624  }
625  return error;
626 }
627 
628 MostLikelySeq *ST_ML::getOrCreate_seq(AP_tree *node) {
630  if (!seq) {
631  seq = new MostLikelySeq(tree_root->get_aliview(), this); // @@@ why not use dup() ?
632 
633  node->set_seq(seq);
634  if (node->is_leaf()) {
635  st_assert(node->gb_node);
636  seq->bind_to_species(node->gb_node);
637  }
638  }
639  return seq;
640 }
641 
642 const MostLikelySeq *ST_ML::get_mostlikely_sequence(AP_tree *node) {
646  MostLikelySeq *seq = getOrCreate_seq(node);
647  if (!seq->is_up_to_date()) {
648  if (node->is_leaf()) {
649  seq->set_sequence();
650  }
651  else {
652  const MostLikelySeq *leftSeq = get_mostlikely_sequence(node->get_leftson());
653  const MostLikelySeq *rightSeq = get_mostlikely_sequence(node->get_rightson());
654 
655  seq->calculate_ancestor(leftSeq, node->leftlen, rightSeq, node->rightlen);
656  }
657  }
658 
659  return seq;
660 }
661 
663  GB_transaction ta(gb_main);
664  undo_tree(tree_root->get_root_node());
665  latest_modification = GB_read_clock(gb_main);
666 }
667 
668 void ST_ML::undo_tree(AP_tree *node) {
669  MostLikelySeq *seq = getOrCreate_seq(node);
670  seq->forget_sequence();
671  if (!node->is_leaf()) {
672  undo_tree(node->get_leftson());
673  undo_tree(node->get_rightson());
674  }
675 }
676 
677 #define GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT (ST_MAX_SEQ_PART-1) // workaround bug in get_ml_vectors
678 
679 MostLikelySeq *ST_ML::get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos) {
680  /* result will be in tmp_out
681  *
682  * assert end_ali_pos - start_ali_pos < ST_MAX_SEQ_PART
683  *
684  * @@@ CAUTION!!! get_ml_vectors has a bug:
685  * it does not calculate the last value, if (end_ali_pos-start_ali_pos+1)==ST_MAX_SEQ_PART
686  * (search for GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT)
687  *
688  * I'm not sure whether this is really a bug! Maybe it's only some misunderstanding about
689  * 'end_ali_pos', because it does not mark the last calculated position, but the position
690  * behind the last calculated position! @@@ Need to rename it!
691  *
692  */
693 
694  if (!node) {
695  if (!hash_2_ap_tree) return NULp;
696  node = (AP_tree *) GBS_read_hash(hash_2_ap_tree, species_name);
697  if (!node) return NULp;
698  }
699 
700  st_assert(start_ali_pos<end_ali_pos);
701  st_assert((end_ali_pos - start_ali_pos + 1) <= ST_MAX_SEQ_PART);
702 
703  MostLikelySeq *seq = getOrCreate_seq(node);
704 
705  if (start_ali_pos != first_pos || end_ali_pos > last_pos) {
706  undo_tree(tree_root->get_root_node()); // undo everything
707  first_pos = start_ali_pos;
708  last_pos = end_ali_pos;
709  }
710 
711  AP_tree *pntr;
712  for (pntr = node->get_father(); pntr; pntr = pntr->get_father()) {
713  MostLikelySeq *sequ = getOrCreate_seq(pntr);
714  if (sequ) sequ->forget_sequence();
715  }
716 
717  node->set_root();
718 
719  const MostLikelySeq *seq_of_brother = get_mostlikely_sequence(node->get_brother());
720 
721  seq->calc_out(seq_of_brother, node->father->leftlen + node->father->rightlen);
722  return seq;
723 }
724 
725 bool ST_ML::update_ml_likelihood(char *result[4], int& latest_update, const char *species_name, AP_tree *node) {
739  st_assert(contradicted(species_name, node));
740 
741  if (latest_update < latest_modification) {
742  if (!node) { // if node isn't given search it using species name
743  st_assert(hash_2_ap_tree); // ST_ML was not prepared for search-by-name
744  if (hash_2_ap_tree) node = (AP_tree *) GBS_read_hash(hash_2_ap_tree, species_name);
745  if (!node) return false;
746  }
747 
748  DNA_Base adb[4];
749  int i;
750 
751  size_t ali_len = get_alignment_length();
752  st_assert(get_filtered_length() == ali_len); // assume column stat was calculated w/o filters
753 
754  if (!result[0]) { // allocate Array-elements for result
755  for (i = 0; i < 4; i++) {
756  ARB_calloc(result[i], ali_len+1); // [0 .. alignment_len[ + zerobyte
757  }
758  }
759 
760  for (i = 0; i < 4; i++) {
761  adb[i] = dna_table.char_to_enum("ACGU"[i]);
762  }
763 
764  for (size_t seq_start = 0; seq_start < ali_len; seq_start += GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT) {
765  size_t seq_end = std::min(ali_len, seq_start+GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT);
766  get_ml_vectors(NULp, node, seq_start, seq_end);
767  }
768 
769  MostLikelySeq *seq = getOrCreate_seq(node);
770 
771  for (size_t pos = 0; pos < ali_len; pos++) {
772  ST_base_vector& vec = seq->tmp_out[pos];
773  double sum = vec.summarize();
774 
775  if (sum == 0) {
776  for (i = 0; i < 4; i++) {
777  result[i][pos] = -1;
778  }
779  }
780  else {
781  double div = 100.0 / sum;
782 
783  for (i = 0; i < 4; i++) {
784  result[i][pos] = char ((vec.b[adb[i]] * div) + 0.5);
785  }
786  }
787  }
788 
789  latest_update = latest_modification;
790  }
791  return true;
792 }
793 
794 ST_ML_Color *ST_ML::get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos) {
798  if (!node) {
799  // if node isn't given, search it using species name:
800  if (!hash_2_ap_tree) return NULp;
801  node = (AP_tree *) GBS_read_hash(hash_2_ap_tree, species_name);
802  if (!node) return NULp;
803  }
804 
805  // align start_ali_pos/end_ali_pos to previous/next pos divisible by ST_BUCKET_SIZE:
806  start_ali_pos &= ~(ST_BUCKET_SIZE - 1);
807  end_ali_pos = (end_ali_pos & ~(ST_BUCKET_SIZE - 1)) + ST_BUCKET_SIZE - 1;
808 
809  size_t ali_len = get_alignment_length();
810  if (end_ali_pos > ali_len) {
811  end_ali_pos = ali_len;
812  }
813 
814  double val;
815  MostLikelySeq *seq = getOrCreate_seq(node);
816  size_t pos;
817 
818  if (!seq->color_out) { // allocate mem for color_out if we not already have it
819  ARB_calloc(seq->color_out, ali_len);
821  }
822  // search for first out-dated position:
823  for (pos = start_ali_pos; pos <= end_ali_pos; pos += ST_BUCKET_SIZE) {
824  if (seq->color_out_valid_till[pos >> LD_BUCKET_SIZE] < latest_modification) break;
825  }
826  if (pos > end_ali_pos) { // all positions are up-to-date
827  return seq->color_out; // => return existing result
828  }
829 
830  for (size_t start = start_ali_pos; start <= end_ali_pos; start += GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT) {
832  get_ml_vectors(NULp, node, start, end); // calculates tmp_out (see below)
833  }
834 
835  const char *source_sequence = NULp;
836  GBDATA *gb_data = seq->get_bound_species_data();
837  if (gb_data) source_sequence = GB_read_char_pntr(gb_data);
838 
839  // create color string in 'outs':
840  ST_ML_Color *outs = seq->color_out + start_ali_pos;
841  ST_base_vector *vec = seq->tmp_out + start_ali_pos; // tmp_out was calculated by get_ml_vectors above
842  const char *source = source_sequence + start_ali_pos;
843 
844  for (pos = start_ali_pos; pos <= end_ali_pos; pos++) {
845  {
846  DNA_Base b = dna_table.char_to_enum(*source); // convert seq-character to enum DNA_Base
847  *outs = 0;
848 
849  if (b != ST_UNKNOWN) {
850  ST_FLOAT max = vec->max_frequency();
851  val = max / (0.0001 + vec->b[b]); // calc ratio of max/real base-char
852 
853  if (val > 1.0) { // if real base-char is NOT the max-likely base-char
854  *outs = (int) (log(val)); // => insert color
855  }
856  }
857  }
858  outs++;
859  vec++;
860  source++;
861 
862  seq->color_out_valid_till[pos >> LD_BUCKET_SIZE] = latest_modification;
863  }
864  return seq->color_out;
865 }
866 
867 void ST_ML::create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment) {
868  column_stat = new ColumnStat(get_gb_main(), awr, awarname, awar_default_alignment);
869 }
870 
872  return tree_root->get_root_node();
873 }
874 
877  tree_root->get_root_node()->calcTreeInfo(info);
878  return info.leafs;
879 }
880 
881 AP_tree *ST_ML::find_node_by_name(const char *species_name) {
882  AP_tree *node = NULp;
883  if (hash_2_ap_tree) node = (AP_tree *)GBS_read_hash(hash_2_ap_tree, species_name);
884  return node;
885 }
886 
887 const AP_filter *ST_ML::get_filter() const { return tree_root->get_filter(); }
889 size_t ST_ML::get_alignment_length() const { return get_filter()->get_length(); }
890 
GBDATA * get_gb_main() const
Definition: st_ml.hxx:194
ST_ML(GBDATA *gb_main)
Definition: ST_ml.cxx:314
GB_ERROR calc_st_ml(const char *tree_name, const char *alignment_name, const char *species_names, int marked_only, ColumnStat *colstat, const WeightedFilter *weighted_filter) __ATTR__USERESULT
Definition: ST_ml.cxx:491
const char * GB_ERROR
Definition: arb_core.h:25
const AliView * get_aliview() const
Definition: ARB_Tree.hxx:75
string result
long remove_leafs(AWT_RemoveType awt_remove_type)
Definition: AP_Tree.cxx:944
void setTo(ST_FLOAT freq)
Definition: st_ml.hxx:64
size_t get_first_pos() const
Definition: st_ml.hxx:206
void clear_all()
Definition: ST_ml.cxx:662
const float * get_frequencies(unsigned char c) const
Definition: ColumnStat.hxx:93
const TreeNode * get_gbt_tree() const
Definition: ST_ml.cxx:871
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:454
const ST_base_vector & get_base_frequency_at(size_t pos) const
Definition: st_ml.hxx:212
virtual void set_root()
Definition: TreeNode.cxx:206
GB_ERROR GB_add_callback(GBDATA *gbd, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:356
void increaseBy(ST_FLOAT inc)
Definition: st_ml.hxx:74
MostLikelySeq(const AliView *aliview, ST_ML *st_ml_)
Definition: ST_ml.cxx:160
void print()
Definition: ST_ml.cxx:128
unsigned char ST_ML_Color
Definition: st_ml.hxx:42
Definition: st_ml.hxx:54
DNA_Base
Definition: st_ml.hxx:49
static void st_sequence_del_callback(GBDATA *, MostLikelySeq *seq)
Definition: ST_ml.cxx:181
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
ST_ML * st_ml
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:833
static char * alignment_name
AP_tree_members gr
Definition: AP_Tree.hxx:214
const ST_base_vector * get_inv_base_frequencies() const
Definition: st_ml.hxx:211
float get_rate_at(size_t pos) const
Definition: st_ml.hxx:213
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:538
void transform(const ST_base_vector &in, ST_base_vector &out) const
Definition: ST_ml.cxx:138
void print()
Definition: ST_ml.cxx:110
void multiplyWith(ST_FLOAT factor)
Definition: st_ml.hxx:69
static ST_base_vector * tmp_out
GB_ERROR GBT_link_tree(TreeNode *tree, GBDATA *gb_main, bool show_status, int *zombies, int *duplicates)
Definition: adtree.cxx:953
const int ST_BUCKET_SIZE
FILE * seq
Definition: rns.c:46
GBT_LEN leftlen
Definition: TreeNode.h:172
Definition: st_ml.hxx:52
Definition: st_ml.hxx:50
#define DOWNCAST(totype, expr)
Definition: downcast.h:141
const AP_filter * get_filter() const
Definition: ARB_Tree.hxx:77
AP_tree * find_node_by_name(const char *species_name)
Definition: ST_ml.cxx:881
size_t leafs
Definition: ARB_Tree.hxx:105
Definition: st_ml.hxx:111
virtual AP_tree * REMOVE()
Definition: AP_Tree.cxx:259
ST_FLOAT min_frequency()
Definition: st_ml.hxx:91
static HelixNrInfo * start
void set_sequence()
Definition: ST_ml.cxx:224
AP_sequence * dup() const OVERRIDE
Definition: ST_ml.cxx:213
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:916
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
#define GET_ML_VECTORS_BUG_WORKAROUND_INCREMENT
Definition: ST_ml.cxx:677
GBDATA * get_bound_species_data() const
void print()
Definition: ST_ml.cxx:303
const float * get_rates() const
Definition: ColumnStat.hxx:90
size_t get_filtered_length() const
Definition: ST_ml.cxx:888
const char * get_tree_name() const
Definition: ARB_Tree.hxx:85
DNA_Base char_to_enum(char i)
const char * awarname(const char *awarname_template, int idx)
Definition: ED4_flags.cxx:37
const float * get_ttratio() const
Definition: ColumnStat.hxx:91
double get_step_size() const
Definition: st_ml.hxx:209
void calculate_ancestor(const MostLikelySeq *lefts, double leftl, const MostLikelySeq *rights, double rightl)
Definition: ST_ml.cxx:247
MostLikelySeq * get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos)
Definition: ST_ml.cxx:679
#define false
Definition: ureadseq.h:13
static int weights[MAX_BASETYPES][MAX_BASETYPES]
Definition: ClustalV.cxx:71
void makeInverseOf(const ST_base_vector &other, ST_FLOAT other_min_freq)
Definition: st_ml.hxx:79
size_t count_species_in_tree() const
Definition: ST_ml.cxx:875
TreeNode * father
Definition: TreeNode.h:171
static void error(const char *msg)
Definition: mkptypes.cxx:96
ST_FLOAT max_frequency()
Definition: st_ml.hxx:92
DNA_Table dna_table
Definition: ST_ml.cxx:24
AP_sequence * get_seq()
Definition: ARB_Tree.hxx:162
size_t get_length() const
Definition: AP_filter.hxx:83
__ATTR__USERESULT GB_ERROR calculate(AP_filter *filter)
Definition: ColumnStat.cxx:107
GB_ERROR loadFromDB(const char *name) FINAL_OVERRIDE
Definition: AP_Tree.cxx:891
AP_sequence * set_seq(AP_sequence *sequence)
Definition: ARB_Tree.hxx:164
TreeNode * get_brother()
Definition: TreeNode.h:435
ST_rate_matrix & get_matrix_for(int distance)
Definition: st_ml.hxx:190
ST_FLOAT summarize() const
Definition: st_ml.hxx:90
void unbind_from_species()
Definition: AP_sequence.cxx:40
GBT_LEN rightlen
Definition: TreeNode.h:172
void GBS_hash_do_loop(GB_HASH *hs, gb_hash_loop_type func, void *client_data)
Definition: adhash.cxx:545
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
void check_overflow()
Definition: ST_ml.cxx:79
ST_FLOAT b[ST_MAX_BASE]
Definition: st_ml.hxx:60
ST_ML_Color * color_out
void calc_out(const MostLikelySeq *sequence_of_brother, double dist)
Definition: ST_ml.cxx:281
bool is_up_to_date() const
const AliView * get_aliview() const
Definition: AP_sequence.hxx:82
bool is_leaf() const
Definition: TreeNode.h:211
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
AliView * create_aliview(const char *aliname, GB_ERROR &error) const
Definition: GUI_aliview.cxx:66
int * color_out_valid_till
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:35
void cleanup()
Definition: ST_ml.cxx:472
bool update_ml_likelihood(char *result[4], int &latest_update, const char *species_name, AP_tree *node)
Definition: ST_ml.cxx:725
void setBase(const ST_base_vector &frequencies, char base)
Definition: ST_ml.cxx:55
static double ttratio
void sequence_change()
Definition: ST_ml.cxx:209
const AP_filter * get_filter() const
Definition: ST_ml.cxx:887
GB_ERROR tree_size_ok(AP_tree_root *tree_root)
Definition: ST_ml.cxx:461
char * name
Definition: TreeNode.h:174
void subtitle(const char *stitle)
Definition: arb_progress.h:321
ST_FLOAT lik
Definition: st_ml.hxx:62
void forget_sequence()
void GB_remove_callback(GBDATA *gbd, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:360
GB_ERROR is_invalid() const
Definition: AP_filter.hxx:123
GB_ERROR bind_to_species(GBDATA *gb_species)
Definition: ST_ml.cxx:186
#define st_assert(cond)
Definition: st_ml.hxx:30
long GB_read_clock(GBDATA *gbd)
Definition: arbdb.cxx:1714
DNA_Table()
Definition: ST_ml.cxx:26
#define NULp
Definition: cxxforward.h:116
const int LD_BUCKET_SIZE
size_t get_last_pos() const
Definition: st_ml.hxx:207
size_t get_alignment_length() const
Definition: ST_ml.cxx:889
ST_ML_Color * get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos)
Definition: ST_ml.cxx:794
Definition: st_ml.hxx:51
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GB_transaction ta(gb_var)
Definition: st_ml.hxx:53
void destroy(TreeNode *that)
Definition: TreeNode.h:600
GB_ERROR bind_to_species(GBDATA *gb_species)
Definition: AP_sequence.cxx:24
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_node
Definition: TreeNode.h:173
GBDATA * gb_main
Definition: adname.cxx:32
void set(double dist, double TT_ratio)
Definition: ST_ml.cxx:120
~MostLikelySeq() OVERRIDE
Definition: ST_ml.cxx:169
const size_t ST_MAX_SEQ_PART
#define min(a, b)
Definition: f2c.h:153
static int info[maxsites+1]
void create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment)
Definition: ST_ml.cxx:867
uint32_t gc
Definition: AP_Tree.hxx:152
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:392
void unbind_from_species(bool remove_callbacks)
Definition: ST_ml.cxx:197
ST_base_vector & operator*=(const ST_base_vector &other)
Definition: ST_ml.cxx:97
float ST_FLOAT
Definition: st_ml.hxx:46
~ST_ML()
Definition: ST_ml.cxx:339
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
#define UNCOVERED()
Definition: arb_assert.h:380
static void st_sequence_callback(GBDATA *, MostLikelySeq *seq)
Definition: ST_ml.cxx:177
#define max(a, b)
Definition: f2c.h:154