ARB
DI_foundclusters.cxx
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : DI_foundclusters.cxx //
4 // Purpose : Store results of cluster detection //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in November 2009 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================ //
11 
12 #include "di_foundclusters.hxx"
13 #include "di_clustertree.hxx"
14 
15 #include <aw_window.hxx>
16 #include <aw_select.hxx>
17 
18 #include <cmath>
19 
20 using namespace std;
21 
22 ID Cluster::unused_id = 1;
23 
24 static AP_FLOAT calc_mean_dist_to(const ClusterTree *distance_to, const LeafRelations *distances) {
25  size_t count = 0;
26  AP_FLOAT sum = 0.0;
27 
28  LeafRelationCIter dist_end = distances->end();
29  for (LeafRelationCIter dist = distances->begin(); dist != dist_end; ++dist) {
30  const TwoLeafs& leafs = dist->first;
31 
32  if (leafs.first() == distance_to || leafs.second() == distance_to) {
33  count++;
34  sum += dist->second;
35  }
36  }
37 
38  cl_assert(count); // 'distance_to' does not occur in 'distances'
39 
40  return sum/count;
41 }
42 
43 Cluster::Cluster(ClusterTree *ct)
44  : next_desc(NULp)
45 {
46  cl_assert(ct->get_state() == CS_IS_CLUSTER);
47 
48  id = unused_id++;
49 
50  const LeafRelations *distances = ct->get_sequence_dists();
51  cl_assert(distances);
52 
53  LeafRelationCIter dist = distances->begin();
54  LeafRelationCIter dist_end = distances->end();
55 
56  cl_assert(dist != dist_end); // oops - empty cluster
57 
58  min_dist = max_dist = dist->second;
59  mean_dist = 0.0;
60 
61  AP_FLOAT min_mean_dist = 9999999999.0;
62  representative = NULp;
63 
64  for (; dist != dist_end; ++dist) {
65  const TwoLeafs& leafs = dist->first;
66  AP_FLOAT di = dist->second;
67 
68  if (di<min_dist) min_dist = di;
69  if (di>max_dist) max_dist = di;
70 
71  mean_dist += di;
72 
73  for (int i = 0; i<2; ++i) {
74  const ClusterTree *member = i ? leafs.first() : leafs.second();
75  GBDATA *gb_species = member->gb_node;
76 
77  cl_assert(gb_species);
78  if (members.find(gb_species) == members.end()) { // not in set
79  members.insert(gb_species);
80  AP_FLOAT mean_dist_to_member = calc_mean_dist_to(member, distances);
81 
82  if (mean_dist_to_member<min_mean_dist) {
83  representative = gb_species;
84  min_mean_dist = mean_dist_to_member;
85  }
86  }
87  }
88 
89  }
90 
91  cl_assert(members.size() == ct->get_leaf_count());
92 
93  min_bases = ct->get_min_bases();
94  {
95  ClusterTree *root = ct->get_root_node();
96  size_t allSize = root->get_leaf_count();
97  size_t size = ct->get_leaf_count();
98  size_t first = ct->relative_position_in(root);
99  size_t last = first+size-1;
100 
101  rel_tree_pos = ((double(first)+last)/2) / allSize;
102  }
103  mean_dist /= distances->size();
104  cl_assert(representative);
105  desc = create_description(ct);
106 }
107 
108 static string *get_downgroups(const ARB_countedTree *ct, const ARB_tree_predicate& keep_group_name, size_t& group_members) {
109  string *result = NULp;
110  if (ct->is_leaf()) {
111  group_members = 0;
112  }
113  else {
114  const char *group_name = ct->get_group_name();
115 
116  if (group_name && keep_group_name.selects(*ct)) {
117  group_members = ct->get_leaf_count();
118  result = new string(ct->name);
119  }
120  else {
121  string *leftgroups = get_downgroups(ct->get_leftson(), keep_group_name, group_members);
122  size_t right_members;
123  string *rightgroups = get_downgroups(ct->get_rightson(), keep_group_name, right_members);
124 
125  group_members += right_members;
126 
127  if (leftgroups || rightgroups) {
128  if (!leftgroups) { result = rightgroups; rightgroups = NULp; }
129  else if (!rightgroups) { result = leftgroups; leftgroups = NULp; }
130  else result = new string(*leftgroups+"+"+*rightgroups);
131 
132  delete leftgroups;
133  delete rightgroups;
134  }
135  }
136  }
137  return result;
138 }
139 static const char *get_upgroup(const ARB_countedTree *ct, const ARB_tree_predicate& keep_group_name, const ARB_countedTree*& upgroupPtr, bool reuse_original_name) {
140  const char *group_name = ct->get_group_name();
141  char *original = (reuse_original_name && group_name) ? originalGroupName(group_name) : NULp;
142 
143  if (original) {
144  static SmartCharPtr result;
145  result = original;
146  group_name = &*result;
147  upgroupPtr = ct;
148  }
149  else if (group_name && keep_group_name.selects(*ct)) {
150  upgroupPtr = ct;
151  }
152  else {
153  const ARB_countedTree *father = ct->get_father();
154  if (father) {
155  group_name = get_upgroup(father, keep_group_name, upgroupPtr, reuse_original_name);
156  }
157  else {
158  upgroupPtr = ct;
159  group_name = "WHOLE_TREE";
160  }
161  }
162 
163  cl_assert(!upgroupPtr || ct->is_inside(upgroupPtr));
164  cl_assert(group_name);
165  return group_name;
166 }
167 
168 string Cluster::create_description(const ARB_countedTree *ct) {
169  // creates cluster description (using up- and down-groups, percentages, existing groups, ...)
170  cl_assert(!ct->is_leaf());
171 
172  string name;
173 
174  UseAnyTree use_any_upgroup;
175 
176  const ARB_countedTree *upgroup = NULp;
177  const char *upgroup_name = get_upgroup(ct, use_any_upgroup, upgroup, false);
178  size_t upgroup_members = upgroup->get_leaf_count();
179 
180  size_t cluster_members = ct->get_leaf_count();
181 
182  cl_assert(upgroup_members>0); // if no group, whole tree should have been returned
183 
184  if (upgroup_members == cluster_members) { // use original or existing name
185  name = string("[G] ")+upgroup_name;
186  }
187  else {
188  size_t downgroup_members;
189  string *downgroup_names = get_downgroups(ct, use_any_upgroup, downgroup_members);
190 
191  AP_FLOAT up_rel = (AP_FLOAT)cluster_members/upgroup_members; // used for: xx% of upgroup (big is good)
192  AP_FLOAT down_rel = (AP_FLOAT)downgroup_members/cluster_members; // used for: downgroup(s) + (1-xx)% outgroup (big is good)
193 
194  if (up_rel>down_rel) { // prefer up_percent
195  name = string(GBS_global_string("%4.1f%% of %s", up_rel*100.0, upgroup_name));
196  }
197  else {
198  if (downgroup_members == cluster_members) {
199  name = downgroup_names->c_str();
200  }
201  else {
202  name = string(GBS_global_string("%s + %4.1f%% outgroup", downgroup_names->c_str(), (1-down_rel)*100.0));
203  }
204  }
205 
206  delete downgroup_names;
207  }
208  return name;
209 }
210 
211 string Cluster::get_upgroup_info(const ARB_countedTree *ct, const ARB_tree_predicate& keep_group_name) {
212  // generates a string indicating relative position in up-group
213  cl_assert(!ct->is_leaf());
214 
215  const ARB_countedTree *upgroup = NULp;
216  const char *upgroup_name = get_upgroup(ct, keep_group_name, upgroup, true);
217 
218  size_t upgroup_members = upgroup->get_leaf_count();
219  size_t cluster_members = ct->get_leaf_count();
220 
221  const char *upgroup_info;
222  if (upgroup_members == cluster_members) {
223  upgroup_info = upgroup_name;
224  }
225  else {
226  size_t cluster_pos1 = ct->relative_position_in(upgroup)+1; // range [1..upgroup_members]
227  size_t cluster_posN = cluster_pos1+cluster_members-1;
228 
229  upgroup_info = GBS_global_string("%zu-%zu/%zu_%s", cluster_pos1, cluster_posN, upgroup_members, upgroup_name);
230  }
231  return upgroup_info;
232 }
233 
234 // --------------------------
235 // DisplayFormat
236 
237 class DisplayFormat : virtual Noncopyable {
238  size_t max_count;
239  AP_FLOAT max_dist;
240  AP_FLOAT max_minBases;
241 
242  mutable char *format_string;
243 
244  static char *make_format(size_t val) {
245  int digits = calc_digits(val);
246  return GBS_global_string_copy("%%%izu", digits);
247  }
248 
249  static void calc_float_format(AP_FLOAT val, int& all, int& afterdot) {
250  int digits = calc_signed_digits(long(val));
251  afterdot = digits <=3 ? 5-digits-1 : 0;
252  cl_assert(afterdot >= 0);
253  all = digits+(afterdot ? (afterdot+1) : 0);
254  }
255 
256  static char *make_format(AP_FLOAT val) {
257  int all, afterdot;
258  calc_float_format(val, all, afterdot);
259  return GBS_global_string_copy("%%%i.%if", all, afterdot);
260  }
261 
262 public:
263 
265  : max_count(0),
266  max_dist(0.0),
267  max_minBases(0.0),
268  format_string(NULp)
269  {}
270 
272  free(format_string);
273  }
274 
275  void announce(size_t count, AP_FLOAT maxDist, AP_FLOAT minBases) {
276  max_count = std::max(max_count, count);
277  max_dist = std::max(max_dist, maxDist);
278  max_minBases = std::max(max_minBases, minBases);
279 
280  freenull(format_string);
281  }
282 
283  const char *get_header() const {
284  int countSize = calc_digits(max_count) + 2; // allow to use two of the spaces behind count
285  int distSize, baseSize;
286  {
287  int afterdot;
288  calc_float_format(max_dist, distSize, afterdot);
289  calc_float_format(max_minBases, baseSize, afterdot);
290  }
291 
292  return GBS_global_string("%-*s%*s %*s Groupname",
293  countSize, countSize<5 ? "#" : "Count",
294  distSize, "Mean [min-max] dist",
295  baseSize, "Bases");
296  }
297 
298  const char *get_format() const {
299  if (!format_string) {
300  // create format string for description
301  char *count_part = make_format(max_count);
302  char *dist_part = make_format(max_dist);
303  char *bases_part = make_format(max_minBases);
304 
305  format_string = GBS_global_string_copy("%s %s [%s-%s] %s %%s",
306  count_part,
307  dist_part, dist_part, dist_part,
308  bases_part);
309 
310  free(bases_part);
311  free(dist_part);
312  free(count_part);
313  }
314  return format_string;
315  }
316 };
317 
318 
319 
321  AP_FLOAT max_of_all_dists = std::max(max_dist, std::max(min_dist, mean_dist));
322  format.announce(get_member_count(), max_of_all_dists*100.0, min_bases);
323 }
324 
325 const char *Cluster::get_list_display(const DisplayFormat *format) const {
326  const char *format_string;
327  if (format) format_string = format->get_format();
328  else format_string = "%zu %.3f [%.3f-%.3f] %.1f %s";
329 
330  return GBS_global_string(format_string,
332  mean_dist*100.0,
333  min_dist*100.0,
334  max_dist*100.0,
335  min_bases,
336  desc.c_str());
337 }
338 
339 
340 // ---------------------
341 // ClustersData
342 
344  known_clusters[clus->get_ID()] = clus;
345  get_subset(subset).push_back(clus->get_ID());
346 }
347 
349  int pos = get_pos(clus, subset);
350  if (pos != -1) {
351  ClusterIDs& ids = get_subset(subset);
352  ClusterIDs::iterator toRemove = ids.begin();
353  advance(toRemove, pos);
354  ids.erase(toRemove);
355  known_clusters.erase(clus->get_ID());
356  }
357 }
358 
360  ClusterIDs& ids = get_subset(subset);
361  ClusterIDsIter id_end = ids.end();
362  for (ClusterIDsIter id = ids.begin(); id != id_end; ++id) {
363  known_clusters.erase(*id);
364  }
365  ids.clear();
366 }
367 
369  ClusterPtr toStore = clusterWithID(id);
370  remove(toStore, SHOWN_CLUSTERS);
371  add(toStore, STORED_CLUSTERS);
372 }
373 
375  copy(shown.begin(), shown.end(), back_insert_iterator<ClusterIDs>(stored));
376  shown.clear();
377  sort_needed = false;
378 }
379 
381  copy(stored.begin(), stored.end(), back_insert_iterator<ClusterIDs>(shown));
382  stored.clear();
383  sort_needed = true;
384 }
385 
387  swap(stored, shown);
388  sort_needed = true;
389 }
390 
391 
393  const KnownClusters& known;
394  ClusterOrder primary_order;
395  ClusterOrder secondary_order;
396 public:
397  SortClusterIDsBy(const KnownClusters& known_, ClusterOrder primary, ClusterOrder secondary) :
398  known(known_),
399  primary_order(primary),
400  secondary_order(secondary)
401  {}
402 
403  bool operator()(ID id1, ID id2) const {
404  ClusterPtr cl1 = known.find(id1)->second;
405  ClusterPtr cl2 = known.find(id2)->second;
406 
407  cl_assert(!cl1.isNull() && !cl2.isNull());
408 
409  bool less = cl1->lessByOrder(*cl2, primary_order);
410  if (!less) {
411  bool equal = !cl2->lessByOrder(*cl1, primary_order);
412  if (equal) {
413  less = cl1->lessByOrder(*cl2, secondary_order);
414  }
415  }
416  return less;
417  }
418 };
419 
422  clusterList->clear();
423 
424  if (shown.empty()) {
425  clusterList->insert_default("<No clusters detected>", ID(0));
426  }
427  else {
428  {
429  SortClusterIDsBy sortBy(known_clusters, criteria[0], criteria[1]);
430  sort(shown.begin(), shown.end(), sortBy);
431  }
432  {
433  ClusterIDsIter cl_end = shown.end();
435 
436  for (int pass = 1; pass <= 2; ++pass) {
437  if (pass == 2) {
438  clusterList->insert(format.get_header(), ID(-1));
439  }
440  for (ClusterIDsIter cl = shown.begin(); cl != cl_end; ++cl) {
441  ID id = *cl;
442  ClusterPtr cluster = clusterWithID(id);
443 
444  if (pass == 1) {
445  cluster->scan_display_widths(format);
446  }
447  else {
448  clusterList->insert(cluster->get_list_display(&format), cluster->get_ID());
449  }
450  }
451  }
452  clusterList->insert_default("<select no cluster>", ID(0));
453  }
454  }
455  clusterList->update();
456 }
457 
459  // delete calculated/stored data
460  shown.clear();
461  stored.clear();
463  known_clusters.clear();
464 }
465 
466 char *originalGroupName(const char *groupname) {
467  char *original = NULp;
468  const char *was = strstr(groupname, " {was:");
469  const char *closing = strchr(groupname, '}');
470  if (was && closing) {
471  original = ARB_strpartdup(was+6, closing-1);
472  if (original[0]) {
473  char *sub = originalGroupName(original);
474  if (sub) freeset(original, sub);
475  }
476  else {
477  freenull(original); // empty -> invalid
478  }
479  }
480  return original;
481 }
482 
483 
484 
string result
const ClusterTree * second() const
const char * id
Definition: AliAdmin.cxx:17
group_matcher all()
Definition: test_unit.h:1000
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
const char * get_list_display(const DisplayFormat *format) const
char * originalGroupName(const char *groupname)
return string(buffer, length)
void insert_default(const char *displayed, const AW_scalar &value)
Definition: AW_select.cxx:385
const ClusterTree * first() const
void update_cluster_selection_list()
static const char * get_upgroup(const ARB_countedTree *ct, const ARB_tree_predicate &keep_group_name, const ARB_countedTree *&upgroupPtr, bool reuse_original_name)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
STL namespace.
CONSTEXPR_INLINE int calc_signed_digits(NUM val)
Definition: arbtools.h:207
CONSTEXPR_INLINE int calc_digits(NUM val)
Definition: arbtools.h:202
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248
char * ARB_strpartdup(const char *start, const char *end)
Definition: arb_string.h:51
Cluster(ClusterTree *ct)
LeafRelations::const_iterator LeafRelationCIter
double AP_FLOAT
Definition: AP_matrix.hxx:27
bool operator()(ID id1, ID id2) const
void insert(const char *displayed, const AW_scalar &value)
Definition: AW_select.cxx:380
void add(ClusterPtr clus, ClusterSubset subset)
POS_TREE1 * father
Definition: probe_tree.h:39
std::string get_upgroup_info(const ARB_countedTree *ct, const ARB_tree_predicate &keep_group_name)
size_t relative_position_in(const ARB_countedTree *upgroup) const
Definition: ARB_Tree.cxx:267
void scan_display_widths(DisplayFormat &format) const
virtual bool selects(const ARB_seqtree &tree) const =0
std::vector< ID > ClusterIDs
bool less(const copy< T > &t1, const copy< T > &t2)
Definition: test_unit.h:634
std::map< TwoLeafs, AP_FLOAT > LeafRelations
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
int32_t ID
void store(ID id)
ID get_ID() const
SortClusterIDsBy(const KnownClusters &known_, ClusterOrder primary, ClusterOrder secondary)
virtual unsigned get_leaf_count() const =0
const char * get_format() const
void clear(ClusterSubset subset)
void remove(ClusterPtr clus, ClusterSubset subset)
std::map< ID, ClusterPtr > KnownClusters
static void copy(double **i, double **j)
Definition: trnsprob.cxx:32
bool is_leaf() const
Definition: TreeNode.h:171
ClusterSubset
static AP_FLOAT calc_mean_dist_to(const ClusterTree *distance_to, const LeafRelations *distances)
ClusterIDs::const_iterator ClusterIDsIter
bool is_inside(const TreeNode *subtree) const
Definition: TreeNode.h:198
ClusterPtr clusterWithID(ID id) const
char * name
Definition: TreeNode.h:134
#define cl_assert(cond)
#define NULp
Definition: cxxforward.h:97
size_t get_member_count() const
bool lessByOrder(const Cluster &other, ClusterOrder sortBy) const
AW_selection_list * clusterList
void announce(size_t count, AP_FLOAT maxDist, AP_FLOAT minBases)
const char * get_group_name() const
Definition: TreeNode.h:445
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
static string * get_downgroups(const ARB_countedTree *ct, const ARB_tree_predicate &keep_group_name, size_t &group_members)
const char * get_header() const
ClusterOrder
#define max(a, b)
Definition: f2c.h:154