ARB
st_ml.hxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : st_ml.hxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #ifndef ST_ML_HXX
12 #define ST_ML_HXX
13 
14 #ifndef ARBDB_BASE_H
15 #include <arbdb_base.h>
16 #endif
17 #ifndef _GLIBCXX_ALGORITHM
18 #include <algorithm>
19 #endif
20 #ifndef ATTRIBUTES_H
21 #include <attributes.h>
22 #endif
23 #ifndef ARBTOOLS_H
24 #include <arbtools.h>
25 #endif
26 #ifndef CB_H
27 #include <cb.h>
28 #endif
29 
30 #define st_assert(cond) arb_assert(cond)
31 
32 class ColumnStat;
33 class MostLikelySeq;
34 
35 class AW_root;
36 class AW_awar;
37 class AW_window;
38 
39 class AP_tree;
40 class AP_tree_root;
41 class AP_filter;
43 
44 typedef unsigned char ST_ML_Color;
45 
46 typedef float ST_FLOAT;
47 // typedef double ST_FLOAT; // careful, using double has quite an impact on the memory footprint
48 
49 enum DNA_Base {
55  ST_MAX_BASE, // [can't be changed w/o recoding several parts]
57 };
58 
60  ST_FLOAT b[ST_MAX_BASE]; // acgt-
61  int ld_lik;
62  ST_FLOAT lik; // likelihood = 2^ld_lik * lik * (b[0] + b[1] + b[2] ..)
63 
64  void setTo(ST_FLOAT freq) {
65  for (int base = ST_A; base<ST_MAX_BASE; ++base) {
66  b[base] = freq;
67  }
68  }
69  void multiplyWith(ST_FLOAT factor) {
70  for (int base = ST_A; base<ST_MAX_BASE; ++base) {
71  b[base] *= factor;
72  }
73  }
74  void increaseBy(ST_FLOAT inc) {
75  for (int base = ST_A; base<ST_MAX_BASE; ++base) {
76  b[base] += inc;
77  }
78  }
79  void makeInverseOf(const ST_base_vector& other, ST_FLOAT other_min_freq) {
80  for (int base = ST_A; base<ST_MAX_BASE; ++base) {
81  b[base] = other_min_freq / other.b[base];
82  }
83  }
84 
85  void setBase(const ST_base_vector& frequencies, char base);
86 
87  void check_overflow();
88  void print();
89 
90  ST_FLOAT summarize() const { return b[ST_A] + b[ST_C] + b[ST_G] + b[ST_T] + b[ST_GAP]; }
93 
94  inline ST_base_vector& operator *= (const ST_base_vector& other);
95 };
96 
98  // symmetric matrix containing only two different values
99 
100  ST_FLOAT diag; // value for indices [0][0], [1][1]...
101  ST_FLOAT rest; // other indices
102 
103 public:
104 
105  ST_rate_matrix() : diag(0.0), rest(0.0) {}
106  void set(double dist, double TT_ratio);
107  inline void transform(const ST_base_vector& in, ST_base_vector& out) const;
108  void print();
109 };
110 
111 class ST_ML : virtual Noncopyable {
112 
113  /* Note: Because we have only limited memory we split the
114  * sequence into several ST_MAX_SEQ_PART-sized parts during stat-calculation
115  */
116 
117  char *alignment_name;
118  GB_HASH *hash_2_ap_tree; // hash table to get from name to tree_node
119  GB_HASH *keep_species_hash; // temporary hash to find ???
120  int refresh_n;
121  int *not_valid; // which columns are valid
122 
123  AP_tree_root *tree_root;
124  int latest_modification; // last mod
125  size_t first_pos; // first..
126  size_t last_pos; // .. and last alignment position of current slice
127 
128  WindowCallbackSimple postcalc_cb;
129  AW_window *cb_window;
130 
131  GBDATA *gb_main;
132 
133  ColumnStat *column_stat;
134  // if column_stat == 0 -> rates and ttratio are owned by ST_ML,
135  // otherwise they belong to column_stat
136  const float *rates; // column independent
137  const float *ttratio; // column independent
138 
139  ST_base_vector *base_frequencies; // column independent
140  ST_base_vector *inv_base_frequencies; // column independent
141 
142  double max_dist; // max_dist for rate_matrices
143  double step_size; // max_dist/step_size matrices
144 
145  int max_rate_matrices; // number of rate_matrices
146  ST_rate_matrix *rate_matrices; // one matrix for each distance
147 
148  bool is_initialized;
149 
150  size_t distance_2_matrices_index(int distance) {
151  if (distance<0) distance = -distance; // same matrix for neg/pos distance
152 
153  return distance >= max_rate_matrices
154  ? (max_rate_matrices-1)
155  : size_t(distance);
156  }
157 
158  MostLikelySeq *getOrCreate_seq(AP_tree *node);
159 
160  const MostLikelySeq *get_mostlikely_sequence(AP_tree *node);
161  void undo_tree(AP_tree *node); // opposite of get_mostlikely_sequence()
162 
163  void insert_tree_into_hash_rek(AP_tree *node);
164  void create_matrices(double max_disti, int nmatrices);
165  void create_frequencies();
166 
167  static long delete_species(const char *key, long val, void *cd_st_ml);
168 
169 public:
170 
171  ST_ML(GBDATA *gb_main);
172  ~ST_ML();
173 
174  void create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment);
175  ColumnStat *get_column_statistic() { return column_stat; }
176 
177  GB_ERROR calc_st_ml(const char *tree_name,
178  const char *alignment_name,
179  const char *species_names, // 0 -> all [marked] species (else species_names is a (char)1 separated list of species)
180  int marked_only,
181  ColumnStat *colstat,
182  const WeightedFilter *weighted_filter) __ATTR__USERESULT;
183 
184  void cleanup();
185 
186  const AP_filter *get_filter() const;
187  size_t get_filtered_length() const;
188  size_t get_alignment_length() const;
189 
191  return rate_matrices[distance_2_matrices_index(distance)];
192  }
193 
194  GBDATA *get_gb_main() const { return gb_main; }
195  const TreeNode *get_gbt_tree() const;
196  size_t count_species_in_tree() const;
197 
198  AP_tree *find_node_by_name(const char *species_name);
199 
200  void set_postcalc_callback(WindowCallbackSimple postcalc_cb_, AW_window *cb_window_) {
201  postcalc_cb = postcalc_cb_;
202  cb_window = cb_window_;
203  }
204  void do_postcalc_callback() { if (postcalc_cb) postcalc_cb(cb_window); }
205 
206  size_t get_first_pos() const { return first_pos; }
207  size_t get_last_pos() const { return last_pos; }
208 
209  double get_step_size() const { return step_size; }
210 
211  const ST_base_vector *get_inv_base_frequencies() const { return inv_base_frequencies; }
212  const ST_base_vector& get_base_frequency_at(size_t pos) const { return base_frequencies[pos]; }
213  float get_rate_at(size_t pos) const { return rates[pos]; }
214 
215  void set_modified(int *what = NULp);
216  void set_refresh(); // set flag for refresh
217 
218  void clear_all(); // delete all caches
219 
220  MostLikelySeq *get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos);
221  ST_ML_Color *get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos);
222 
223  bool update_ml_likelihood(char *result[4], int& latest_update, const char *species_name, AP_tree *node);
224 
225  int refresh_needed();
226 };
227 
228 #else
229 #error st_ml.hxx included twice
230 #endif // ST_ML_HXX
GBDATA * get_gb_main() const
Definition: st_ml.hxx:194
ST_ML(GBDATA *gb_main)
Definition: ST_ml.cxx:314
GB_ERROR calc_st_ml(const char *tree_name, const char *alignment_name, const char *species_names, int marked_only, ColumnStat *colstat, const WeightedFilter *weighted_filter) __ATTR__USERESULT
Definition: ST_ml.cxx:491
const char * GB_ERROR
Definition: arb_core.h:25
string result
void setTo(ST_FLOAT freq)
Definition: st_ml.hxx:64
size_t get_first_pos() const
Definition: st_ml.hxx:206
void clear_all()
Definition: ST_ml.cxx:662
const TreeNode * get_gbt_tree() const
Definition: ST_ml.cxx:871
const ST_base_vector & get_base_frequency_at(size_t pos) const
Definition: st_ml.hxx:212
void(* WindowCallbackSimple)(AW_window *)
Definition: cb.h:29
ColumnStat * get_column_statistic()
Definition: st_ml.hxx:175
void set_modified(int *what=NULp)
void increaseBy(ST_FLOAT inc)
Definition: st_ml.hxx:74
void print()
Definition: ST_ml.cxx:128
unsigned char ST_ML_Color
Definition: st_ml.hxx:42
Definition: st_ml.hxx:54
DNA_Base
Definition: st_ml.hxx:49
const ST_base_vector * get_inv_base_frequencies() const
Definition: st_ml.hxx:211
float get_rate_at(size_t pos) const
Definition: st_ml.hxx:213
void transform(const ST_base_vector &in, ST_base_vector &out) const
Definition: ST_ml.cxx:138
void print()
Definition: ST_ml.cxx:110
void multiplyWith(ST_FLOAT factor)
Definition: st_ml.hxx:69
int refresh_needed()
Definition: st_ml.hxx:52
Definition: st_ml.hxx:50
AP_tree * find_node_by_name(const char *species_name)
Definition: ST_ml.cxx:881
Definition: st_ml.hxx:111
ST_FLOAT min_frequency()
Definition: st_ml.hxx:91
size_t get_filtered_length() const
Definition: ST_ml.cxx:888
const char * awarname(const char *awarname_template, int idx)
Definition: ED4_flags.cxx:37
double get_step_size() const
Definition: st_ml.hxx:209
MostLikelySeq * get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos)
Definition: ST_ml.cxx:679
void makeInverseOf(const ST_base_vector &other, ST_FLOAT other_min_freq)
Definition: st_ml.hxx:79
size_t count_species_in_tree() const
Definition: ST_ml.cxx:875
ST_FLOAT max_frequency()
Definition: st_ml.hxx:92
ST_rate_matrix & get_matrix_for(int distance)
Definition: st_ml.hxx:190
ST_FLOAT summarize() const
Definition: st_ml.hxx:90
void check_overflow()
Definition: ST_ml.cxx:79
ST_FLOAT b[ST_MAX_BASE]
Definition: st_ml.hxx:60
void set_postcalc_callback(WindowCallbackSimple postcalc_cb_, AW_window *cb_window_)
Definition: st_ml.hxx:200
void cleanup()
Definition: ST_ml.cxx:472
bool update_ml_likelihood(char *result[4], int &latest_update, const char *species_name, AP_tree *node)
Definition: ST_ml.cxx:725
void setBase(const ST_base_vector &frequencies, char base)
Definition: ST_ml.cxx:55
const AP_filter * get_filter() const
Definition: ST_ml.cxx:887
void set_refresh()
#define __ATTR__USERESULT
Definition: attributes.h:58
ST_FLOAT lik
Definition: st_ml.hxx:62
#define NULp
Definition: cxxforward.h:116
size_t get_last_pos() const
Definition: st_ml.hxx:207
size_t get_alignment_length() const
Definition: ST_ml.cxx:889
ST_ML_Color * get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos)
Definition: ST_ml.cxx:794
Definition: st_ml.hxx:51
Definition: st_ml.hxx:53
void set(double dist, double TT_ratio)
Definition: ST_ml.cxx:120
#define min(a, b)
Definition: f2c.h:153
void create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment)
Definition: ST_ml.cxx:867
ST_base_vector & operator*=(const ST_base_vector &other)
Definition: ST_ml.cxx:97
float ST_FLOAT
Definition: st_ml.hxx:46
void do_postcalc_callback()
Definition: st_ml.hxx:204
~ST_ML()
Definition: ST_ml.cxx:339
#define max(a, b)
Definition: f2c.h:154