ARB
DI_clustertree.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : DI_clustertree.cxx //
4 // Purpose : //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in October 2009 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "di_clustertree.hxx"
13 #include <arb_progress.h>
14 #include <set>
15 #include <cmath>
16 #include <limits>
17 
18 using namespace std;
19 
20 typedef std::set<LeafRelation> SortedPairValues; // iterator runs from small to big values
21 
22 // ------------------------
23 // ClusterTreeRoot
24 
25 
26 ClusterTreeRoot::ClusterTreeRoot(AliView *aliview, AP_sequence *seqTemplate_, AP_FLOAT maxDistance_, size_t minClusterSize_)
27  : ARB_seqtree_root(aliview, seqTemplate_, false),
28  maxDistance(maxDistance_),
29  minClusterSize(minClusterSize_)
30 {}
31 
32 #if defined(DEBUG)
33 struct ClusterStats {
34  size_t clusters;
35  size_t subclusters;
36  size_t loadedSequences;
37 
38  ClusterStats() :
39  clusters(0),
40  subclusters(0),
41  loadedSequences(0)
42  {}
43 };
44 
45 static void collectStats(ClusterTree *node, ClusterStats *stats) {
46  switch (node->get_state()) {
47  case CS_IS_CLUSTER: stats->clusters++; break;
48  case CS_SUB_CLUSTER: stats->subclusters++; break;
49  default: break;
50  }
51  if (node->is_leaf()) {
52  stats->loadedSequences += node->hasSequence();
53  }
54  else {
55  collectStats(node->get_leftson(), stats);
56  collectStats(node->get_rightson(), stats);
57  }
58 }
59 #endif // DEBUG
60 
61 GB_ERROR ClusterTreeRoot::find_clusters() {
62  ClusterTree *root = get_root_node();
63 
64  root->init_tree();
65 #if defined(DEBUG)
66  printf("----------------------------------------\n");
67  printf("maxDistance: %f\n", maxDistance);
68  printf("minClusterSize: %u\n", minClusterSize);
69  printf("Leafs in tree: %u\n", root->get_leaf_count());
70  printf("Possible clusters: %u\n", root->get_cluster_count());
71 #endif // DEBUG
72 
73  arb_progress cluster_progress(long(root->get_cluster_count()));
74  cluster_progress.auto_subtitles("Cluster");
75 
77 
78  try {
79  root->detect_clusters(cluster_progress);
80  }
81  catch (const char *msg) { error = msg; }
82  catch (...) { cl_assert(0); }
83 
84  if (error) {
85  // avoid further access after error
86  change_root(root, NULp);
87  UNCOVERED();
88  destroy(root);
89  }
90  else {
91 #if defined(DEBUG)
92  ClusterStats stats;
93  collectStats(root, &stats);
94 
95  printf("Found clusters:::::::: %zu\n", stats.clusters);
96  printf("Found subclusters::::: %zu\n", stats.subclusters);
97  printf("Loaded sequences:::::: %zu (%5.2f%%)\n",
98  stats.loadedSequences,
99  (100.0*stats.loadedSequences)/root->get_leaf_count());
100 
101 #if defined(TRACE_DIST_CALC)
102  size_t distances = root->get_calculated_distances();
103  size_t leafs = root->get_leaf_count();
104  size_t existingDistances = (leafs*leafs)/2-1;
105 
106  printf("Calculated distances:: %zu (%5.2f%%)\n",
107  distances,
108  (100.0*distances)/existingDistances);
109 #endif // TRACE_DIST_CALC
110 
111 #endif // DEBUG
112  }
113 
114  return error;
115 }
116 
117 // --------------------
118 // ClusterTree
119 
120 void ClusterTree::init_tree() {
121  cl_assert(state == CS_UNKNOWN);
122 
123 #if defined(TRACE_DIST_CALC)
124  calculatedDistances = 0;
125 #endif // TRACE_DIST_CALC
126 
127  if (get_father()) {
128  depth = get_father()->get_depth()+1;
129  }
130  else {
131  depth = 1;
132  cl_assert(is_root_node());
133  }
134 
135  if (is_leaf()) {
136  leaf_count = 1;
137  clus_count = 0;
138  state = CS_TOO_SMALL;
139  }
140  else {
141  ClusterTree *lson = get_leftson();
142  ClusterTree *rson = get_rightson();
143 
144  lson->init_tree();
145  rson->init_tree();
146 
147  leaf_count = lson->get_leaf_count() + rson->get_leaf_count();
148 
149  ClusterTreeRoot *troot = get_tree_root();
150  if (leaf_count<troot->get_minClusterSize()) {
151  state = CS_TOO_SMALL;
152  clus_count = 0;
153  }
154  else {
155  state = CS_MAYBE_CLUSTER;
156  clus_count = lson->get_cluster_count() + rson->get_cluster_count() + 1;
157  min_bases = numeric_limits<AP_FLOAT>::infinity();
158  }
159  }
160 
161  cl_assert(state != CS_UNKNOWN);
162 }
163 
164 void ClusterTree::detect_clusters(arb_progress& progress) {
165  if (state == CS_MAYBE_CLUSTER) {
166  cl_assert(!is_leaf());
167 
168  ClusterTree *lson = get_leftson();
169  ClusterTree *rson = get_rightson();
170 
171  lson->detect_clusters(progress);
172  rson->detect_clusters(progress);
173 
174 #if defined(TRACE_DIST_CALC)
175  calculatedDistances += get_leftson()->calculatedDistances;
176  calculatedDistances += get_rightson()->calculatedDistances;
177 #endif // TRACE_DIST_CALC
178 
179  if (lson->get_state() == CS_NO_CLUSTER || rson->get_state() == CS_NO_CLUSTER) {
180  // one son is no cluster -> can't be cluster myself
181  state = CS_NO_CLUSTER;
182  }
183  else {
184  state = CS_IS_CLUSTER; // assume this is a cluster
185 
186  // Get set of branches (sorted by branch distance)
187 
188  get_branch_dists(); // calculates branchDists
189  SortedPairValues pairsByBranchDistance; // biggest branch distances at end
190  {
191  LeafRelationCIter bd_end = branchDists->end();
192  for (LeafRelationCIter bd = branchDists->begin(); bd != bd_end; ++bd) {
193  pairsByBranchDistance.insert(LeafRelation(bd->first, bd->second));
194  }
195  }
196 
197  cl_assert(pairsByBranchDistance.size() == possible_relations());
198 
199  // calculate real sequence distance
200  // * stop when distance > maxDistance is detected
201  // * calculate longest branchdistance first
202  // (Assumption is: big branch distance <=> big sequence distance)
203 
204  AP_FLOAT maxDistance = get_tree_root()->get_maxDistance();
205  AP_FLOAT worstDistanceSeen = -1.0;
206 
207  cl_assert(!sequenceDists);
208  sequenceDists = new LeafRelations;
209 
210  SortedPairValues::const_reverse_iterator pair_end = pairsByBranchDistance.rend();
211  for (SortedPairValues::const_reverse_iterator pair = pairsByBranchDistance.rbegin(); pair != pair_end; ++pair) {
212  AP_FLOAT realDist = get_seqDist(pair->get_pair());
213 
214  if (realDist>worstDistanceSeen) {
215  worstDistanceSeen = realDist;
216 
217  delete worstKnownDistance;
218  worstKnownDistance = new TwoLeafs(pair->get_pair());
219 
220  if (realDist>maxDistance) { // big distance found -> not a cluster
221  state = CS_NO_CLUSTER;
222  break;
223  }
224  }
225  }
226 
227 #if defined(TRACE_DIST_CALC)
228  calculatedDistances += sequenceDists->size();
229 #endif // TRACE_DIST_CALC
230 
231  if (state == CS_IS_CLUSTER) {
232 #if defined(DEBUG)
233  // test whether all distances have been calculated
234  cl_assert(!is_leaf());
235  size_t knownDistances = sequenceDists->size() +
236  get_leftson()->known_seqDists() +
237  get_rightson()->known_seqDists();
238 
239  cl_assert(knownDistances == possible_relations());
240 #endif // DEBUG
241 
242  get_leftson()->state = CS_SUB_CLUSTER;
243  get_rightson()->state = CS_SUB_CLUSTER;
244  }
245 
246  }
247 
248  cl_assert(state == CS_NO_CLUSTER || state == CS_IS_CLUSTER); // detection failure
249 
250  lson->oblivion(false);
251  rson->oblivion(false);
252 
253  progress.inc();
254  if (progress.aborted()) throw "aborted on userrequest";
255  }
256 
257  cl_assert(state != CS_MAYBE_CLUSTER);
258 }
259 
260 void ClusterTree::oblivion(bool forgetDistances) {
261  delete branchDepths;
262  branchDepths = NULp;
263 
264  delete branchDists;
265  branchDists = NULp;
266 
267  if (state == CS_NO_CLUSTER) {
268  if (forgetDistances) {
269  delete sequenceDists;
270  sequenceDists = NULp;
271  }
272  forgetDistances = true;
273  }
274  else {
276  }
277 
278  if (!is_leaf()) {
279  get_leftson()->oblivion(forgetDistances);
280  get_rightson()->oblivion(forgetDistances);
281  }
282 }
283 
284 void ClusterTree::calc_branch_depths() {
285  cl_assert(!branchDepths);
286  branchDepths = new NodeValues;
287  if (is_leaf()) {
288  (*branchDepths)[this] = 0.0;
289  }
290  else {
291  for (int s = 0; s<2; s++) {
292  const NodeValues *sonDepths = s ? get_leftson()->get_branch_depths() : get_rightson()->get_branch_depths();
293  AP_FLOAT lenToSon = s ? leftlen : rightlen;
294 
295  NodeValues::const_iterator sd_end = sonDepths->end();
296  for (NodeValues::const_iterator sd = sonDepths->begin(); sd != sd_end; ++sd) {
297  (*branchDepths)[sd->first] = sd->second+lenToSon;
298  }
299  }
300  }
301 }
302 
303 void ClusterTree::calc_branch_dists() {
304  cl_assert(!branchDists);
305  if (!is_leaf()) {
306  branchDists = new LeafRelations;
307 
308  for (int s = 0; s<2; s++) {
309  ClusterTree *son = s ? get_leftson() : get_rightson();
310  const LeafRelations *sonDists = son->get_branch_dists();
311 
312  cl_assert(sonDists || son->is_leaf());
313  if (sonDists) branchDists->insert(sonDists->begin(), sonDists->end());
314  }
315 
316  const NodeValues *ldepths = get_leftson()->get_branch_depths();
317  const NodeValues *rdepths = get_rightson()->get_branch_depths();
318 
319  NodeValues::const_iterator l_end = ldepths->end();
320  NodeValues::const_iterator r_end = rdepths->end();
321 
322  for (NodeValues::const_iterator ld = ldepths->begin(); ld != l_end; ++ld) {
323  AP_FLOAT llen = ld->second+leftlen;
324  for (NodeValues::const_iterator rd = rdepths->begin(); rd != r_end; ++rd) {
325  AP_FLOAT rlen = rd->second+rightlen;
326 
327  (*branchDists)[TwoLeafs(ld->first, rd->first)] = llen+rlen;
328  }
329  }
330 
331  cl_assert(branchDists->size() == possible_relations());
332  }
333 }
334 
335 AP_FLOAT ClusterTree::get_seqDist(const TwoLeafs& pair) {
336  // searches or calculates the distance between the sequences of leafs in 'pair'
337 
338  const AP_FLOAT *found = has_seqDist(pair);
339  if (!found) {
340  const ClusterTree *commonFather = pair.first()->commonFatherWith(pair.second());
341 
342  while (commonFather != this) {
343  found = commonFather->has_seqDist(pair);
344  if (found) break;
345 
346  commonFather = commonFather->get_father();
347  cl_assert(commonFather); // first commonFather was not inside this!
348  }
349  }
350 
351  if (!found) {
352  // calculate the distance
353 
354  const AP_combinableSeq *seq1 = pair.first()->get_seq();
355  const AP_combinableSeq *seq2 = pair.second()->get_seq();
356 
357  seq1->lazy_load_sequence();
358  seq2->lazy_load_sequence();
359 
360  {
361  AP_combinableSeq *ancestor = seq1->dup();
362 
363  Mutations mutations = ancestor->combine_seq(seq1, seq2);
364  AP_FLOAT wbc1 = seq1->weighted_base_count();
365  AP_FLOAT wbc2 = seq2->weighted_base_count();
366  AP_FLOAT minBaseCount = wbc1 < wbc2 ? wbc1 : wbc2;
367  AP_FLOAT dist;
368 
369  if (minBaseCount>0) {
370  dist = mutations / minBaseCount;
371  if (minBaseCount < min_bases) min_bases = minBaseCount;
372  }
373  else { // got at least one empty sequence -> no distance
374  dist = 0.0;
375  min_bases = minBaseCount;
376  }
377 
378 #if defined(DEBUG) && 0
379  const char *name1 = pair.first()->name;
380  const char *name2 = pair.second()->name;
381  printf("%s <-?-> %s: mutations=%5.2f wbc1=%5.2f wbc2=%5.2f mbc=%5.2f dist=%5.2f\n",
382  name1, name2, mutations, wbc1, wbc2, minBaseCount, dist);
383 #endif // DEBUG
384 
385  cl_assert(!is_nan_or_inf(dist));
386 
387  (*sequenceDists)[pair] = dist;
388  delete ancestor;
389  }
390 
391  found = has_seqDist(pair);
392  cl_assert(found);
393  }
394 
395  cl_assert(found);
396  return *found;
397 }
398 
399 const AP_FLOAT *ClusterTree::has_seqDist(const TwoLeafs& pair) const {
400  if (sequenceDists) {
401  LeafRelationCIter found = sequenceDists->find(pair);
402  if (found != sequenceDists->end()) {
403  return &(found->second);
404  }
405  }
406  return NULp;
407 }
408 
409 const ClusterTree *ClusterTree::commonFatherWith(const ClusterTree *other) const {
410  int depth_diff = get_depth() - other->get_depth();
411  if (depth_diff) {
412  if (depth_diff<0) { // this is less deep than other
413  return other->get_father()->commonFatherWith(this);
414  }
415  else { // other is less deep than this
416  return get_father()->commonFatherWith(other);
417  }
418  }
419  else { // both at same depth
420  if (this == other) { // common father reached ?
421  return this;
422  }
423  else { // otherwise check fathers
424  return get_father()->commonFatherWith(other->get_father());
425  }
426  }
427 }
428 
const char * GB_ERROR
Definition: arb_core.h:25
const ClusterTree * second() const
const ClusterTree * first() const
STL namespace.
LeafRelations::const_iterator LeafRelationCIter
double AP_FLOAT
Definition: AP_matrix.hxx:30
CONSTEXPR_INLINE bool is_nan_or_inf(const T &n)
Definition: arbtools.h:178
POS_TREE1 * get_father() const
Definition: probe_tree.h:49
bool aborted()
Definition: arb_progress.h:335
#define false
Definition: ureadseq.h:13
virtual AP_combinableSeq * dup() const =0
long Mutations
Definition: AP_sequence.hxx:99
static void error(const char *msg)
Definition: mkptypes.cxx:96
std::map< TwoLeafs, AP_FLOAT > LeafRelations
#define cl_assert(cond)
AP_FLOAT weighted_base_count() const
#define NULp
Definition: cxxforward.h:116
bool is_leaf() const
Definition: probe_tree.h:67
std::set< LeafRelation > SortedPairValues
void lazy_load_sequence() const
Definition: AP_sequence.hxx:67
virtual Mutations combine_seq(const AP_combinableSeq *lefts, const AP_combinableSeq *rights, char *mutation_per_site=NULp)=0
void destroy(TreeNode *that)
Definition: TreeNode.h:600
std::map< ClusterTree *, AP_FLOAT > NodeValues
#define UNCOVERED()
Definition: arb_assert.h:380
GB_write_int const char s
Definition: AW_awar.cxx:154