ARB
DI_foundclusters.cxx
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : DI_foundclusters.cxx //
4 // Purpose : Store results of cluster detection //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in November 2009 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================ //
11 
12 #include "di_foundclusters.hxx"
13 #include "di_clustertree.hxx"
14 
15 #include <aw_window.hxx>
16 #include <aw_select.hxx>
17 
18 #include <cmath>
19 
20 using namespace std;
21 
22 ID Cluster::unused_id = 1;
23 
24 static AP_FLOAT calc_mean_dist_to(const ClusterTree *distance_to, const LeafRelations *distances) {
25  size_t count = 0;
26  AP_FLOAT sum = 0.0;
27 
28  LeafRelationCIter dist_end = distances->end();
29  for (LeafRelationCIter dist = distances->begin(); dist != dist_end; ++dist) {
30  const TwoLeafs& leafs = dist->first;
31 
32  if (leafs.first() == distance_to || leafs.second() == distance_to) {
33  count++;
34  sum += dist->second;
35  }
36  }
37 
38  cl_assert(count); // 'distance_to' does not occur in 'distances'
39 
40  return sum/count;
41 }
42 
43 Cluster::Cluster(ClusterTree *ct)
44  : next_desc(NULp)
45 {
46  cl_assert(ct->get_state() == CS_IS_CLUSTER);
47 
48  id = unused_id++;
49 
50  const LeafRelations *distances = ct->get_sequence_dists();
51  cl_assert(distances);
52 
53  LeafRelationCIter dist = distances->begin();
54  LeafRelationCIter dist_end = distances->end();
55 
56  cl_assert(dist != dist_end); // oops - empty cluster
57 
58  min_dist = max_dist = dist->second;
59  mean_dist = 0.0;
60 
61  AP_FLOAT min_mean_dist = 9999999999.0;
62  representative = NULp;
63 
64  for (; dist != dist_end; ++dist) {
65  const TwoLeafs& leafs = dist->first;
66  AP_FLOAT di = dist->second;
67 
68  if (di<min_dist) min_dist = di;
69  if (di>max_dist) max_dist = di;
70 
71  mean_dist += di;
72 
73  for (int i = 0; i<2; ++i) {
74  const ClusterTree *member = i ? leafs.first() : leafs.second();
75  GBDATA *gb_species = member->gb_node;
76 
77  cl_assert(gb_species);
78  if (members.find(gb_species) == members.end()) { // not in set
79  members.insert(gb_species);
80  AP_FLOAT mean_dist_to_member = calc_mean_dist_to(member, distances);
81 
82  if (mean_dist_to_member<min_mean_dist) {
83  representative = gb_species;
84  min_mean_dist = mean_dist_to_member;
85  }
86  }
87  }
88 
89  }
90 
91  cl_assert(members.size() == ct->get_leaf_count());
92 
93  min_bases = ct->get_min_bases();
94  {
95  ClusterTree *root = ct->get_root_node();
96  size_t allSize = root->get_leaf_count();
97  size_t size = ct->get_leaf_count();
98  size_t first = ct->relative_position_in(root);
99  size_t last = first+size-1;
100 
101  rel_tree_pos = ((double(first)+last)/2) / allSize;
102  }
103  mean_dist /= distances->size();
104  cl_assert(representative);
105  desc = create_description(ct);
106 }
107 
108 static string *get_downgroups(const ARB_countedTree *ct, const ARB_tree_predicate& keep_group_name, size_t& group_members) {
109  string *result = NULp;
110  if (ct->is_leaf()) {
111  group_members = 0;
112  }
113  else {
114  const char *group_name = ct->get_group_name();
115 
116  if (group_name && keep_group_name.selects(*ct)) {
117  group_members = ct->get_leaf_count();
118  if (ct->is_keeled_group()) result = new string(GBS_global_string("%c%s", KEELED_INDICATOR, group_name));
119  else result = new string(group_name);
120  }
121  else {
122  string *leftgroups = get_downgroups(ct->get_leftson(), keep_group_name, group_members);
123  size_t right_members;
124  string *rightgroups = get_downgroups(ct->get_rightson(), keep_group_name, right_members);
125 
126  group_members += right_members;
127 
128  if (leftgroups || rightgroups) {
129  if (!leftgroups) { result = rightgroups; rightgroups = NULp; }
130  else if (!rightgroups) { result = leftgroups; leftgroups = NULp; }
131  else result = new string(*leftgroups+"+"+*rightgroups);
132 
133  delete leftgroups;
134  delete rightgroups;
135  }
136  }
137  }
138  return result;
139 }
140 static const char *get_upgroup(const ARB_countedTree *ct, const ARB_tree_predicate& keep_group_name, const ARB_countedTree*& upgroupPtr, bool reuse_original_name) {
141  const char *group_name = ct->get_group_name();
142  char *original = (reuse_original_name && group_name) ? originalGroupName(group_name) : NULp;
143 
144  if (original) {
145  static SmartCharPtr result;
146  result = original;
147  group_name = &*result;
148  upgroupPtr = ct;
149  }
150  else if (group_name && keep_group_name.selects(*ct)) {
151  upgroupPtr = ct;
152  }
153  else {
154  const ARB_countedTree *father = ct->get_father();
155  if (father) {
156  group_name = get_upgroup(father, keep_group_name, upgroupPtr, reuse_original_name);
157  }
158  else {
159  upgroupPtr = ct;
160  group_name = "<WHOLE TREE>";
161  }
162  }
163 
164  cl_assert(!upgroupPtr || ct->is_inside(upgroupPtr));
165  cl_assert(group_name);
166  return group_name;
167 }
168 
169 string Cluster::create_description(const ARB_countedTree *ct) {
170  // creates cluster description (using up- and down-groups, percentages, existing groups, ...)
171  cl_assert(!ct->is_leaf());
172 
173  string name;
174 
175  UseAnyTree use_any_upgroup;
176 
177  const ARB_countedTree *upgroup = NULp;
178  const char *upgroup_name = get_upgroup(ct, use_any_upgroup, upgroup, false);
179  size_t upgroup_members = upgroup->get_leaf_count();
180 
181  size_t cluster_members = ct->get_leaf_count();
182 
183  cl_assert(upgroup_members>0); // if no group, whole tree should have been returned
184 
185  if (upgroup_members == cluster_members) { // use original or existing name
186  name = string("[G] ")+upgroup_name;
187  }
188  else {
189  size_t downgroup_members;
190  string *downgroup_names = get_downgroups(ct, use_any_upgroup, downgroup_members);
191 
192  AP_FLOAT up_rel = (AP_FLOAT)cluster_members/upgroup_members; // used for: xx% of upgroup (big is good)
193  AP_FLOAT down_rel = (AP_FLOAT)downgroup_members/cluster_members; // used for: downgroup(s) + (1-xx)% outgroup (big is good)
194 
195  if (up_rel>down_rel) { // prefer up_percent
196  name = string(GBS_global_string("%4.1f%% of %s", up_rel*100.0, upgroup_name));
197  }
198  else {
199  if (downgroup_members == cluster_members) {
200  name = downgroup_names->c_str();
201  }
202  else {
203  name = string(GBS_global_string("%s + %4.1f%% outgroup", downgroup_names->c_str(), (1-down_rel)*100.0));
204  }
205  }
206 
207  delete downgroup_names;
208  }
209  return name;
210 }
211 
212 string Cluster::get_upgroup_info(const ARB_countedTree *ct, const ARB_tree_predicate& keep_group_name, const string& separator) {
213  // generates a string indicating relative position in up-group
214  cl_assert(!ct->is_leaf());
215 
216  const ARB_countedTree *upgroup = NULp;
217  const char *upgroup_name = get_upgroup(ct, keep_group_name, upgroup, true);
218 
219  size_t upgroup_members = upgroup->get_leaf_count();
220  size_t cluster_members = ct->get_leaf_count();
221 
222  const char *upgroup_info;
223  if (upgroup_members == cluster_members) {
224  upgroup_info = upgroup_name;
225  }
226  else {
227  size_t cluster_pos1 = ct->relative_position_in(upgroup)+1; // range [1..upgroup_members]
228  size_t cluster_posN = cluster_pos1+cluster_members-1;
229 
230  upgroup_info = GBS_global_string("%zu.%zu[%zu]%s%s", cluster_pos1, cluster_posN, upgroup_members, separator.c_str(), upgroup_name);
231  }
232  return upgroup_info;
233 }
234 
235 // --------------------------
236 // DisplayFormat
237 
238 class DisplayFormat : virtual Noncopyable {
239  size_t max_count;
240  AP_FLOAT max_dist;
241  AP_FLOAT max_minBases;
242 
243  mutable char *format_string;
244 
245  static char *make_format(size_t val) {
246  int digits = calc_digits(val);
247  return GBS_global_string_copy("%%%izu", digits);
248  }
249 
250  static void calc_float_format(AP_FLOAT val, int& all, int& afterdot) {
251  int digits = calc_signed_digits(long(val));
252  afterdot = digits <=3 ? 5-digits-1 : 0;
253  cl_assert(afterdot >= 0);
254  all = digits+(afterdot ? (afterdot+1) : 0);
255  }
256 
257  static char *make_format(AP_FLOAT val) {
258  int all, afterdot;
259  calc_float_format(val, all, afterdot);
260  return GBS_global_string_copy("%%%i.%if", all, afterdot);
261  }
262 
263 public:
264 
266  : max_count(0),
267  max_dist(0.0),
268  max_minBases(0.0),
269  format_string(NULp)
270  {}
271 
273  free(format_string);
274  }
275 
276  void announce(size_t count, AP_FLOAT maxDist, AP_FLOAT minBases) {
277  max_count = std::max(max_count, count);
278  max_dist = std::max(max_dist, maxDist);
279  max_minBases = std::max(max_minBases, minBases);
280 
281  freenull(format_string);
282  }
283 
284  const char *get_header() const {
285  int countSize = calc_digits(max_count) + 2; // allow to use two of the spaces behind count
286  int distSize, baseSize;
287  {
288  int afterdot;
289  calc_float_format(max_dist, distSize, afterdot);
290  calc_float_format(max_minBases, baseSize, afterdot);
291  }
292 
293  return GBS_global_string("%-*s%*s %*s Groupname",
294  countSize, countSize<5 ? "#" : "Count",
295  distSize, "Mean [min-max] dist",
296  baseSize, "Bases");
297  }
298 
299  const char *get_format() const {
300  if (!format_string) {
301  // create format string for description
302  char *count_part = make_format(max_count);
303  char *dist_part = make_format(max_dist);
304  char *bases_part = make_format(max_minBases);
305 
306  format_string = GBS_global_string_copy("%s %s [%s-%s] %s %%s",
307  count_part,
308  dist_part, dist_part, dist_part,
309  bases_part);
310 
311  free(bases_part);
312  free(dist_part);
313  free(count_part);
314  }
315  return format_string;
316  }
317 };
318 
319 
320 
322  AP_FLOAT max_of_all_dists = std::max(max_dist, std::max(min_dist, mean_dist));
323  format.announce(get_member_count(), max_of_all_dists*100.0, min_bases);
324 }
325 
326 const char *Cluster::get_list_display(const DisplayFormat *format) const {
327  const char *format_string;
328  if (format) format_string = format->get_format();
329  else format_string = "%zu %.3f [%.3f-%.3f] %.1f %s";
330 
331  return GBS_global_string(format_string,
333  mean_dist*100.0,
334  min_dist*100.0,
335  max_dist*100.0,
336  min_bases,
337  desc.c_str());
338 }
339 
340 
341 // ---------------------
342 // ClustersData
343 
345  known_clusters[clus->get_ID()] = clus;
346  get_subset(subset).push_back(clus->get_ID());
347 }
348 
350  int pos = get_pos(clus, subset);
351  if (pos != -1) {
352  ClusterIDs& ids = get_subset(subset);
353  ClusterIDs::iterator toRemove = ids.begin();
354  advance(toRemove, pos);
355  ids.erase(toRemove);
356  known_clusters.erase(clus->get_ID());
357  }
358 }
359 
361  ClusterIDs& ids = get_subset(subset);
362  ClusterIDsIter id_end = ids.end();
363  for (ClusterIDsIter id = ids.begin(); id != id_end; ++id) {
364  known_clusters.erase(*id);
365  }
366  ids.clear();
367 }
368 
370  ClusterPtr toStore = clusterWithID(id);
371  remove(toStore, SHOWN_CLUSTERS);
372  add(toStore, STORED_CLUSTERS);
373 }
374 
376  copy(shown.begin(), shown.end(), back_insert_iterator<ClusterIDs>(stored));
377  shown.clear();
378  sort_needed = false;
379 }
380 
382  copy(stored.begin(), stored.end(), back_insert_iterator<ClusterIDs>(shown));
383  stored.clear();
384  sort_needed = true;
385 }
386 
388  swap(stored, shown);
389  sort_needed = true;
390 }
391 
392 
394  const KnownClusters& known;
395  ClusterOrder primary_order;
396  ClusterOrder secondary_order;
397 public:
398  SortClusterIDsBy(const KnownClusters& known_, ClusterOrder primary, ClusterOrder secondary) :
399  known(known_),
400  primary_order(primary),
401  secondary_order(secondary)
402  {}
403 
404  bool operator()(ID id1, ID id2) const {
405  ClusterPtr cl1 = known.find(id1)->second;
406  ClusterPtr cl2 = known.find(id2)->second;
407 
408  cl_assert(!cl1.isNull() && !cl2.isNull());
409 
410  bool less = cl1->lessByOrder(*cl2, primary_order);
411  if (!less) {
412  bool equal = !cl2->lessByOrder(*cl1, primary_order);
413  if (equal) {
414  less = cl1->lessByOrder(*cl2, secondary_order);
415  }
416  }
417  return less;
418  }
419 };
420 
423  clusterList->clear();
424 
425  if (shown.empty()) {
426  clusterList->insert_default("<No clusters detected>", ID(0));
427  }
428  else {
429  {
430  SortClusterIDsBy sortBy(known_clusters, criteria[0], criteria[1]);
431  sort(shown.begin(), shown.end(), sortBy);
432  }
433  {
434  ClusterIDsIter cl_end = shown.end();
436 
437  for (int pass = 1; pass <= 2; ++pass) {
438  if (pass == 2) {
439  clusterList->insert(format.get_header(), ID(-1));
440  }
441  for (ClusterIDsIter cl = shown.begin(); cl != cl_end; ++cl) {
442  ID id = *cl;
443  ClusterPtr cluster = clusterWithID(id);
444 
445  if (pass == 1) {
446  cluster->scan_display_widths(format);
447  }
448  else {
449  clusterList->insert(cluster->get_list_display(&format), cluster->get_ID());
450  }
451  }
452  }
453  clusterList->insert_default("<select no cluster>", ID(0));
454  }
455  }
456  clusterList->update();
457 }
458 
460  // delete calculated/stored data
461  shown.clear();
462  stored.clear();
464  known_clusters.clear();
465 }
466 
467 char *originalGroupName(const char *groupname) {
468  char *original = NULp;
469  const char *was = strstr(groupname, " {was:");
470  const char *closing = strchr(groupname, '}');
471  if (was && closing) {
472  original = ARB_strpartdup(was+6, closing-1);
473  if (original[0]) {
474  char *sub = originalGroupName(original);
475  if (sub) freeset(original, sub);
476  }
477  else {
478  freenull(original); // empty -> invalid
479  }
480  }
481  return original;
482 }
483 
484 
485 
string result
const ClusterTree * second() const
const char * id
Definition: AliAdmin.cxx:17
group_matcher all()
Definition: test_unit.h:1011
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
const char * get_list_display(const DisplayFormat *format) const
char * originalGroupName(const char *groupname)
return string(buffer, length)
void insert_default(const char *displayed, const AW_scalar &value)
Definition: AW_select.cxx:385
const ClusterTree * first() const
void update_cluster_selection_list()
static const char * get_upgroup(const ARB_countedTree *ct, const ARB_tree_predicate &keep_group_name, const ARB_countedTree *&upgroupPtr, bool reuse_original_name)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
STL namespace.
CONSTEXPR_INLINE int calc_signed_digits(NUM val)
Definition: arbtools.h:210
CONSTEXPR_INLINE int calc_digits(NUM val)
Definition: arbtools.h:205
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248
char * ARB_strpartdup(const char *start, const char *end)
Definition: arb_string.h:51
Cluster(ClusterTree *ct)
LeafRelations::const_iterator LeafRelationCIter
double AP_FLOAT
Definition: AP_matrix.hxx:30
bool operator()(ID id1, ID id2) const
void insert(const char *displayed, const AW_scalar &value)
Definition: AW_select.cxx:380
void add(ClusterPtr clus, ClusterSubset subset)
POS_TREE1 * father
Definition: probe_tree.h:39
std::string get_upgroup_info(const ARB_countedTree *ct, const ARB_tree_predicate &keep_group_name, const std::string &separator)
size_t relative_position_in(const ARB_countedTree *upgroup) const
Definition: ARB_Tree.cxx:267
void scan_display_widths(DisplayFormat &format) const
CONSTEXPR_INLINE int digits(int parts)
virtual bool selects(const ARB_seqtree &tree) const =0
std::vector< ID > ClusterIDs
bool less(const copy< T > &t1, const copy< T > &t2)
Definition: test_unit.h:645
std::map< TwoLeafs, AP_FLOAT > LeafRelations
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
bool is_keeled_group() const
Definition: TreeNode.h:475
int32_t ID
void store(ID id)
ID get_ID() const
SortClusterIDsBy(const KnownClusters &known_, ClusterOrder primary, ClusterOrder secondary)
virtual unsigned get_leaf_count() const =0
const char * get_format() const
void clear(ClusterSubset subset)
void remove(ClusterPtr clus, ClusterSubset subset)
std::map< ID, ClusterPtr > KnownClusters
static void copy(double **i, double **j)
Definition: trnsprob.cxx:32
bool is_leaf() const
Definition: TreeNode.h:211
ClusterSubset
static AP_FLOAT calc_mean_dist_to(const ClusterTree *distance_to, const LeafRelations *distances)
ClusterIDs::const_iterator ClusterIDsIter
bool is_inside(const TreeNode *subtree) const
Definition: TreeNode.h:238
ClusterPtr clusterWithID(ID id) const
#define cl_assert(cond)
#define KEELED_INDICATOR
Definition: TreeNode.h:168
#define NULp
Definition: cxxforward.h:114
size_t get_member_count() const
bool lessByOrder(const Cluster &other, ClusterOrder sortBy) const
AW_selection_list * clusterList
void announce(size_t count, AP_FLOAT maxDist, AP_FLOAT minBases)
const char * get_group_name() const
Definition: TreeNode.h:486
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
static string * get_downgroups(const ARB_countedTree *ct, const ARB_tree_predicate &keep_group_name, size_t &group_members)
const char * get_header() const
ClusterOrder
#define max(a, b)
Definition: f2c.h:154