ARB
NT_tree_cmp.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : NT_tree_cmp.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "NT_species_set.h"
12 #include <aw_msg.hxx>
13 #include <gb_aci.h>
14 #include <gb_aci_impl.h>
15 #include <arb_progress.h>
16 #include <arb_defs.h>
17 #include <string>
18 #include <climits>
19 
20 using namespace std;
21 
22 SpecSetRegistry::SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer& scorer_) :
23  species_counter(0),
24  nspecies(nspecies_),
25  nsets(0),
26  sets(ARB_calloc<RSpecSet*>(max_nsets())),
27  scorer(scorer_),
28  progress(progress_),
29  species_hash(GBS_create_hash(nspecies, GB_IGNORE_CASE))
30 {
31  for (int i=0; i<256; i++) {
32  int j = i;
33  int count = 0;
34  while (j) { // count bits
35  if (j&1) count ++;
36  j = j>>1;
37  }
38  set_bits[i] = count;
39  }
40  tmp_bitstring = allocate_bitstring();
41 }
42 
44  for (int i=0; i<nsets; i++) delete sets[i];
45  free(tmp_bitstring);
46  free(sets);
47  GBS_free_hash(species_hash);
48 }
49 
50 void SpecSetRegistry::add(const char *species_name) {
51  if (GBS_read_hash(species_hash, species_name)) {
52  aw_message(GBS_global_string("Warning: Species '%s' is found more than once in tree", species_name));
53  }
54  else {
55  GBS_write_hash(species_hash, species_name, ++species_counter);
56  nt_assert(species_counter<=nspecies); // more species added than planned
57  nt_assert(species_counter<=nspecies);
58  }
59 }
60 
61 void SpecSetRegistry::add(RSpecSet *rset) {
62  nt_assert(rset->size()>1); // do NOT register leafs (only inner nodes allowed)
63  nt_assert(nsets<max_nsets());
64  sets[nsets++] = rset;
65 }
66 
67 void SpecSetRegistry::dump_bitstring(const char *tag, unsigned char *bs) {
68  fprintf(stderr, "%s: ", tag);
69  const long bbytes = bitstring_bytes();
70  for (long i = 0; i<bbytes; ++i) {
71  int val = bs[i];
72  for (int b = 0; b<8; ++b) {
73  fputc(val&128 ? 'X' : '-', stderr);
74  val<<=1;
75  }
76  }
77  fputc('\n', stderr);
78 }
79 
80 GroupPenalty GroupMatchScorer::calcPenalty(long removed, long added, long commonSpecies) {
81  // - can be calculated from removed/added/commonSpecies
82  // - shall be reported (result param?)
83 
84  if (commonSpecies) {
85  long oldGroupSize = removed+commonSpecies;
86  long newGroupSize = added+commonSpecies;
87  double ingroupRatio = 1.0 - removed/double(oldGroupSize); // = percent of old members in new group
88  double outgroupRatio = added/double(newGroupSize); // = percent of non-group-members in new group
89 
90  if (insideLimits(ingroupRatio, outgroupRatio)) {
91  double penalty =
92  // abs. penalty:
93  removed*ingroupPep +
94  added*outgroupPep +
95  // rel. penalty:
96  (1-ingroupRatio)*ingroupInvRelPen +
97  outgroupRatio*outgroupRelPen;
98 
99  return GroupPenalty(penalty, ingroupRatio, outgroupRatio, oldGroupSize);
100  }
101  }
102  return GroupPenalty::NoMatch();
103 }
104 
105 GroupPenalty GroupMatchScorer::matchGroups(const TSpecSet& sourceSet, const RSpecSet& targetSet, long commonSpecies, long overallSpecies) {
114  long removed = sourceSet.get_known_members() - commonSpecies;
115  long added = targetSet.get_known_members() - commonSpecies;
116 
117  GroupPenalty match = calcPenalty(removed, added, commonSpecies);
118  nt_assert(implicated(match.doesMatch(), (removed+added)<overallSpecies));
119 
120  long targetSize_keeled = overallSpecies - targetSet.get_known_members();
121  long commonSpecies_keeled = sourceSet.get_known_members() - commonSpecies;
122  long removed_keeled = commonSpecies;
123  long added_keeled = targetSize_keeled - commonSpecies_keeled;
124 
125  GroupPenalty match_keeled = calcPenalty(removed_keeled, added_keeled, commonSpecies_keeled);
126  match_keeled.mark_as_keeled();
127  if (match_keeled.doesMatch()) { // do not add to NoMatch!
128  match_keeled.addPenalty(keelPenalty);
129  }
130  nt_assert(implicated(match_keeled.doesMatch(), (removed_keeled+added_keeled)<overallSpecies));
131 
132  if (match_keeled.betterThan(match)) {
133  // @@@ if this happens, the match semantic is inverted: if moving a group -> group should be keeled (by caller)!
134  // related to #512 + #451
135  return match_keeled;
136  }
137 
138  return match;
139 }
140 
142  return sourceSet.get_unknown_members() * unfoundPep; // penalty for unknown members in source tree
143 }
144 
146  if (ingroupPep == 0.0 && ingroupInvRelPen == 0.0) {
147  return "one ingroup penalty has to be different from zero";
148  }
149  if (outgroupPep == 0.0 && outgroupRelPen == 0.0) {
150  return "one outgroup penalty has to be different from zero";
151  }
152 
153  if (ingroupPep<0.0 || outgroupPep<0.0 || ingroupInvRelPen<0.0 || outgroupRelPen<0.0) {
154  return "invalid negative in/outgroup penalty";
155  }
156 
157  if (!ingroup.isValid() || !outgroup.isValid()) {
158  return "invalid limits";
159  }
160 
161  return NULp;
162 }
163 
165  // returns best match for 'tset' (against all registered sets).
166  // NULp if no match found.
167  // sets 'min_penalty' to penalty found for returned set.
168 
169  RSpecSet *bset = NULp;
170  min_penalty = GroupPenalty::NoMatch();
171 
172  const long bbytes = bitstring_bytes();
173  const long blongs = bitstring_longs();
174 
175  unsigned char * const tbs = tset->bitstring;
176  long * const tbl = (long*)tbs;
177  long * const tmp_bl = (long*)tmp_bitstring;
178 
179  for (long i=nsets-1; i>=0; i--) {
180  long same = 0; // amount of known members present in both sets
181 
182  RSpecSet *rset = sets[i];
183 
184  unsigned char * const rbs = rset->bitstring;
185  long * const rbl = (long*)rbs;
186 
187  for (long j = 0; j<blongs; ++j) { // LOOP_VECTORIZED
188  tmp_bl[j] = tbl[j] & rbl[j];
189  }
190  for (long j = 0; j<bbytes; ++j) { // (Note: failed to optimize: gcc wont vectorize double-indexed access)
191  same += set_bits[tmp_bitstring[j]];
192  }
193 
194  GroupPenalty penalty = scorer.matchGroups(*tset, *rset, same, nspecies);
195  if (penalty.betterThan(min_penalty)) {
196  min_penalty = penalty;
197  bset = rset;
198  }
199  progress->inc();
200  }
201 
202  return bset;
203 }
204 
205 void GroupPenalty::registerUnfound(const GroupMatchScorer& scorer, const TSpecSet& tset) {
207  nt_assert(unknown == -1); // registerUnfound shall be called exactly once!
208  unknown = tset.get_unknown_members();
209 }
210 
211 double SpecSetRegistry::search_and_remember_best_match_and_log_errors(const TSpecSet *tset, FILE *log) {
224  GroupPenalty match;
225  RSpecSet *bset = search_best_match(tset, match);
226 
227  if (!bset) {
228  // no good match found.
229  // atm this happens for small groups (e.g. with 2 species) + a special placement of these species in tree.
230  // in the future 'finding no match' will get normal behavior.
231  return -1;
232  }
233 
234  AP_tree *earlierFoundGroup = bset->matchedNode(); // group previously considered for same destination node
235  AP_tree *currentGroup = tset->set_node;
236 
237  match.registerUnfound(scorer, *tset);
238  if (match.betterThan(bset->bestMatch())) {
239  nt_assert(!bset->bestMatch().isPerfectMatch()); // shall never supersede optimally placed group!
240  if (earlierFoundGroup && log) {
241  nt_assert(earlierFoundGroup->name && currentGroup->name);
242  fprintf(log, "Group '%s' skipped (got superseded by group '%s'; same best target positions)\n",
243  earlierFoundGroup->name,
244  currentGroup->name);
245  }
246 
247  bset->storeBetterMatch(match, currentGroup);
248  }
249  else {
250  nt_assert(!match.isPerfectMatch()); // found two groups with optimal placement. should be impossible by design! can be caused by invalid penalties
251  if (log) {
252  nt_assert(earlierFoundGroup->name && currentGroup->name);
253  fprintf(log, "Group '%s' skipped (does NOT supersede group '%s'; same best target positions)\n",
254  currentGroup->name,
255  earlierFoundGroup->name);
256  }
257  }
258 
259  return match.get_penalty();
260 }
261 
262 void SpecSet::init(AP_tree *nodei, const SpecSetRegistry& ssr) {
264  known_members = 0;
265  set_node = nodei;
266 }
267 
268 SpecSet::SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) {
269  init(nodei, ssr);
270 
271  long species_index = ssr.get_species_index(species_name);
272  if (species_index) {
273  nt_assert(species_index>0);
274  --species_index; // [1..N] -> [0..N-1]
275  nt_assert((species_index/8) < ssr.bitstring_bytes());
276  bitstring[species_index/8] |= 1 << (species_index % 8);
277  known_members = 1;
278  }
279 }
280 
281 SpecSet::SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const SpecSet *l, const SpecSet *r) {
282  init(nodei, ssr);
283 
284  const long *lbs = (const long *)l->bitstring;
285  const long *rbs = (const long *)r->bitstring;
286  long *dbs = (long *)bitstring;
287 
288  for (long j = ssr.bitstring_longs()-1; j>=0; j--) { // LOOP_VECTORIZED=2
289  dbs[j] = lbs[j] | rbs[j];
290  }
292 }
293 
295  free(bitstring);
296 }
297 
298 RSpecSet::RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) :
299  SpecSet(nodei, ssr, species_name),
300  best_node(NULp)
301 {}
302 RSpecSet::RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const RSpecSet *l, const RSpecSet *r) :
303  SpecSet(nodei, ssr, l, r),
304  best_node(NULp)
305 {}
306 TSpecSet::TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) :
307  SpecSet(nodei, ssr, species_name),
308  unfound_species_count(1-known_members)
309 {
310 }
311 TSpecSet::TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const TSpecSet *l, const TSpecSet *r) :
312  SpecSet(nodei, ssr, l, r),
313  unfound_species_count(l->unfound_species_count + r->unfound_species_count)
314 {
315 }
316 
318  RSpecSet *ss;
319  // Warning: confusing resource handling:
320  // - leafs are returned "NOT owned by anybody"
321  // - inner nodes are added to and owned by this->sets
322 
323  if (node->is_leaf()) {
324  this->add(node->name);
325  ss = new RSpecSet(node, *this, node->name);
326  nt_assert(ss->is_leaf_set());
327  }
328  else {
329  RSpecSet *ls = registerTree(node->get_leftson());
330  RSpecSet *rs = registerTree(node->get_rightson());
331 
332  ss = new RSpecSet(node, *this, ls, rs);
333  this->add(ss);
334  nt_assert(!ss->is_leaf_set());
335 
336  if (ls->is_leaf_set()) delete ls;
337  if (rs->is_leaf_set()) delete rs;
338  }
339  return ss;
340 }
341 
342 TSpecSet *SpecSetRegistry::find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info) {
343  /* Go through all nodes of subtree 'node' (part of the source tree if copying info)
344  * and search for the best matching registered node (in this->sets; which belong to dest tree if copying info).
345  *
346  * [Note: If comparing topologies, source- and dest-trees were exchanged by caller.]
347  *
348  * Named groups (if copying info) resp. all nodes (if comparing topologies)
349  * get compared versus all registered nodes and the best matching node is detected.
350  * The compared node is stored inside the best matching node.
351  */
352 
353  nt_assert(contradicted(log, compare_node_info));
354 
355  TSpecSet *ss = NULp;
356  if (node->is_leaf()) {
357  ss = new TSpecSet(node, *this, node->name);
358  }
359  else {
360  TSpecSet *ls = find_best_matches_info(node->get_leftson(), log, compare_node_info);
361  TSpecSet *rs = ls ? find_best_matches_info(node->get_rightson(), log, compare_node_info) : NULp;
362 
363  if (rs) {
364  ss = new TSpecSet(node, *this, ls, rs);
365  if (compare_node_info) {
366  nt_assert(!log); // does not work / not allowed
367 
368  double penalty = search_and_remember_best_match_and_log_errors(ss, log);
369  int ipenalty = int(penalty);
370 
371  // the #-sign is important (otherwise TREE_write_Newick will not work correctly; interference with bootstrap values; refer to #787 + #767)
372  if (ipenalty>0) node->use_as_remark(GBS_global_string_copy("# %i", ipenalty));
373  else if (ipenalty<0) node->set_remark("# no match");
374  else node->remove_remark();
375  }
376  else {
377  if (node->name) {
378  double penalty = search_and_remember_best_match_and_log_errors(ss, log);
379  if (penalty<0) {
380  nt_assert(log);
381  fprintf(log, "Group '%s' doesn't fit to any destination subtree.\n", node->name);
382  }
383  }
384  }
385  }
386  delete rs;
387  delete ls;
388  }
389 
390  if (ss && progress->aborted()) {
391  delete ss;
392  ss = NULp; // abort recursion after user-abort
393  }
394  return ss;
395 }
396 
397 // -----------------------------
398 // local ACI extension
399 
401  RefPtr<const char> name;
402  const GroupPenalty& match;
403 
404  int oldGroupSize;
405  int newGroupSize;
406 
407 public:
408  GroupXfer_callenv(const GBL_env& env_, const char *name_, const GroupPenalty& match_, int oldGroupSize_, int newGroupSize_) :
409  GBL_call_env(NULp, env_),
410  name(name_),
411  match(match_),
412  oldGroupSize(oldGroupSize_),
413  newGroupSize(newGroupSize_)
414  {}
415 
416  const char *get_group_name() const { return name; }
417  const GroupPenalty& get_match() const { return match; }
418  int get_old_group_size() const { return oldGroupSize; }
419  int get_new_group_size() const { return newGroupSize; }
420 };
421 
423 inline const GroupPenalty& custom_match(GBL_command_arguments *args) { return custom_env(args).get_match(); }
424 
425 using namespace GBL_IMPL;
426 
427 #define IMPL_FORMATVALUE_CMD(args,fmt,value) \
428  COMMAND_DROPS_INPUT_STREAMS(args); \
429  GB_ERROR error = check_no_parameter(args); \
430  if (!error) FORMAT_2_OUT(args, fmt, value); \
431  return error
432 
433 static GB_ERROR gxl_groupname(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%s", custom_env(args).get_group_name()); }
434 static GB_ERROR gxl_penalty(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%f", custom_match(args).get_penalty()); }
435 static GB_ERROR gxl_ingroup(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%.1f%%", custom_match(args).get_ingroup_ratio()*100.0); }
436 static GB_ERROR gxl_outgroup(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%.1f%%", custom_match(args).get_outgroup_ratio()*100.0); }
437 static GB_ERROR gxl_oldsize(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_env(args).get_old_group_size()); }
438 static GB_ERROR gxl_newsize(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_env(args).get_new_group_size()); }
439 static GB_ERROR gxl_unknown(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_match(args).get_unknown()); }
440 static GB_ERROR gxl_keeled(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", int(custom_match(args).shouldHaveBeenKeeled())); }
441 
443  { "groupname", gxl_groupname },
444  { "penalty", gxl_penalty },
445  { "ingroup", gxl_ingroup },
446  { "outgroup", gxl_outgroup },
447  { "oldsize", gxl_oldsize },
448  { "newsize", gxl_newsize },
449  { "unknown", gxl_unknown },
450  { "keeled", gxl_keeled },
451  { NULp, NULp }
452 };
453 
455  static GBL_custom_command_lookup_table clt(groupXfer_command_table,
456  ARRAY_ELEMS(groupXfer_command_table)-1,
458  return clt;
459 }
460 
461 
462 
463 GB_ERROR SpecSetRegistry::write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci) {
464  GB_ERROR error = NULp;
465 
466  if (log) fputs("\nDetailed group changes:\n\n", log);
467 
468  GBL_env env(NULp, // have no item here
469  NULp, // use no treename
471 
472  for (long j=this->nsets-1; j>=0 && !error; j--) {
473  RSpecSet * const cset = this->sets[j];
474 
475  char *old_group_name = NULp; // !=NULp -> old node has been deleted
476  AP_tree *sourceNode = cset->matchedNode();
477  AP_tree *targetNode = cset->set_node;
478  bool insert_new_node = sourceNode && sourceNode->name;
479 
480  if (what == XFER_GROUPS_WITH_MARKED && insert_new_node) {
481  if (!targetNode->contains_marked_species()) insert_new_node = false; // @@@ this tests the wrong tree (the target tree)! (#792)
482  }
483 
484  if (targetNode->gb_node && (delete_old_nodes || insert_new_node)) { // There is already a node, delete old
485  if (!targetNode->name) {
486  GBDATA *gb_name = GB_entry(targetNode->gb_node, "group_name");
487  if (gb_name) {
488  targetNode->name = GB_read_string(gb_name);
489  }
490  else {
491  targetNode->name = ARB_strdup("<gb_node w/o name>"); // @@@ happens whenever any other node info is present (e.g. linewidth or angles; node gets deleted when delete_old_nodes==true); #793
492  }
493  }
494 
495  old_group_name = ARB_strdup(targetNode->name); // store old_group_name to rename new inserted node
496 
497  error = GB_delete(targetNode->gb_node);
498  if (!error) {
499  targetNode->gb_node = NULp;
500  freenull(targetNode->name);
501  }
502  }
503 
504  if (!error) {
505  if (insert_new_node) {
506  targetNode->gb_node = GB_create_container(targetNode->get_tree_root()->get_gb_tree(), "node");
507  error = GB_copy_dropProtectMarksAndTempstate(targetNode->gb_node, sourceNode->gb_node); // @@@ this copies more info than group-name (e.g. linewidth or angles; unwanted!); #793
508  // @@@ need a function which copies/moves/deletes group-related info (between two nodes). currently affected fields: 'group_name', 'grouped', 'keeled'; #793
509 
510  if (!error) {
511  GBDATA *gb_name = GB_entry(targetNode->gb_node, "group_name");
512  nt_assert(gb_name);
513  if (gb_name) {
514  const GroupPenalty& match = cset->bestMatch();
515 
516  int oldGroupsize = match.get_groupsize();
517  int newGroupsize = cset->size();
518 
519  char *new_group_name = NULp; // new wanted groupname
520  {
521  char *source_group_name = GB_read_string(gb_name); // existing groupname (in group-container copied from source tree)
522  if (aci) {
523  GroupXfer_callenv callEnv(env, source_group_name, match, oldGroupsize, newGroupsize);
524  new_group_name = GB_command_interpreter_in_env("", aci, callEnv);
525  if (!new_group_name) error = GB_await_error();
526  }
527  else {
528  reassign(new_group_name, source_group_name);
529  }
530  free(source_group_name);
531  }
532 
533  if (!error) {
534  nt_assert(new_group_name);
535  if (old_group_name) { // overwrite existing group
536  if (!delete_old_nodes) {
537  if (strcmp(old_group_name, new_group_name) != 0) { // old and new name differ
538  char *combined_name = GBS_global_string_copy("%s [was: %s]", new_group_name, old_group_name);
539  if (log) fprintf(log, "Destination group '%s' overwritten by '%s'", old_group_name, combined_name);
540  reassign(new_group_name, combined_name);
541  }
542  else { // name matches existing
543  if (log) fprintf(log, "Group '%s' remains unchanged", old_group_name);
544  }
545  }
546  else {
547  if (log) {
548  if (strcmp(old_group_name, new_group_name) == 0) {
549  fprintf(log, "Group '%s' remains unchanged", old_group_name);
550  }
551  else {
552  fprintf(log, "Destination group '%s' overwritten by '%s'", old_group_name, new_group_name);
553  }
554  }
555  }
556  }
557  else { // no group at destination -> insert
558  if (log) fprintf(log, "Group '%s' inserted", new_group_name);
559  }
560  }
561 
562  if (!error) error = GBT_write_group_name(gb_name, new_group_name, true); // always write wanted target name
563 
564  if (log) {
565  if (error) {
566  fprintf(log, " (Failed! Reason: %s)\n", error);
567  }
568  else if (match.isPerfectMatch()) {
569  fputc('\n', log);
570  }
571  else {
572  fprintf(log, " (not placed optimal; penalty=%f; in=%.1f%%/%i; out=%.1f%%/%i%s)\n",
573  match.get_penalty(),
574  match.get_ingroup_ratio() *100.0, oldGroupsize,
575  match.get_outgroup_ratio()*100.0, newGroupsize,
576  match.shouldHaveBeenKeeled() ? "; WARNING: matched inverse set, i.e. rest of tree!" : "");
577  }
578  }
579 
580  free(new_group_name);
581  }
582  }
583  }
584  else {
585  nt_assert(implicated(old_group_name, delete_old_nodes));
586  if (old_group_name && log) {
587  fprintf(log, "Destination group '%s' removed\n", old_group_name);
588  }
589  }
590  }
591  free(old_group_name);
592  }
593  return error;
594 }
595 
597  if (!error) error = progress->error_if_aborted();
598  progress->done();
599 }
600 
601 GB_ERROR NTREE_move_tree_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, GroupTransferMode mode, GroupsToTransfer what, const GroupMatchScorer& scorer, const char *aci) {
602  GB_ERROR error = NULp;
603  FILE *log = NULp;
604 
605  if (!contradicted(mode == COMPARE_TOPOLOGY, log_file)) {
606  GBK_terminatef("invalid use of function NTREE_move_tree_info: mode=%i log_file=%s\n", int(mode), log_file);
607  }
608 
609  if (aci && !aci[0]) aci = NULp; // handle "empty ACI" like "no ACI"
610 
611  if (mode == COMPARE_TOPOLOGY) {
612  // info is written into 'tree_source' in .@find_best_matches_info
613  // (but we want to modify destination tree - like 'mode node info' does)
614  std::swap(tree_source, tree_dest);
615  }
616 
617  if (log_file) {
618  nt_assert(mode == REMOVE_EXISTING_GROUPS || mode == KEEP_OLD_NAMES);
619  log = fopen(log_file, "w");
620  if (log) {
621  const char *transferredGroups = what == XFER_GROUPS_WITH_MARKED ? "groups containing marked" : "all";
622  const char *overwriteMode = NULp;
623 
624  switch (mode) {
625  case COMPARE_TOPOLOGY: nt_assert(0); break;
626  case REMOVE_EXISTING_GROUPS: overwriteMode = "remove existing groups"; break;
627  case KEEP_OLD_NAMES: overwriteMode = "keep old names"; break;
628  }
629 
630  nt_assert(transferredGroups && overwriteMode);
631 
632  fprintf(log, // start of log
633  "LOGFILE: transfer group information\n"
634  "\n"
635  " Source tree '%s'\n"
636  "Destination tree '%s'\n"
637  "\n"
638  "transferred groups: %s\n"
639  " overwrite mode: %s\n"
640  "\n",
641  tree_source,
642  tree_dest,
643  transferredGroups,
644  overwriteMode);
645  }
646  else {
647  error = GB_IO_error("writing", log_file);
648  }
649  }
650 
651  GB_begin_transaction(gb_main);
652 
653  AP_tree_root rsource(new AliView(gb_main), NULp, false, NULp);
654  AP_tree_root rdest (new AliView(gb_main), NULp, false, NULp);
655  AP_tree_root& rsave = (mode == COMPARE_TOPOLOGY) ? rsource : rdest;
656  arb_progress progress(mode == COMPARE_TOPOLOGY ? "Compare topologies" : (mode == REMOVE_EXISTING_GROUPS ? "Copy group information" : "Add group information"));
657 
658  if (!error) error = scorer.check_validity();
659  if (!error) error = rsource.loadFromDB(tree_source);
660  if (!error) error = rsource.linkToDB(NULp, NULp);
661  if (!error) {
662  error = rdest.loadFromDB(tree_dest);
663  if (!error) error = rdest.linkToDB(NULp, NULp);
664  if (!error) {
665  AP_tree *source = rsource.get_root_node();
666  AP_tree *dest = rdest.get_root_node();
667 
668  int dest_leafs = dest->count_leafs();
669  int dest_nodes = leafs_2_nodes(dest_leafs, ROOTED);
670  int dest_inodes = dest_nodes-dest_leafs; // inner nodes
671 
672  int source_leafs = source->count_leafs();
673  int source_nodes = leafs_2_nodes(source_leafs, ROOTED);
674  int source_inodes = source_nodes-source_leafs; // inner nodes
675 
676  size_t compared_nodes = mode == COMPARE_TOPOLOGY ? source_inodes : source->count_clades();
677  size_t progress_steps = (compared_nodes + 2) * dest_inodes; // 2 for searching both sides of root
678 
679  arb_progress compare_progress(progress_steps);
680  compare_progress.subtitle("Register topology");
681 
682  {
683  SpecSetRegistry ssr(dest_leafs, &compare_progress, scorer);
684 
685  ssr.registerTree(dest);
686 
687  if (source_leafs < 3) error = GB_export_errorf("tree '%s' has less than 3 species", tree_source);
688  else {
689  compare_progress.subtitle("Match subtrees");
690 
691  TSpecSet *root_tsetl = ssr.find_best_matches_info(source->get_leftson(), log, mode == COMPARE_TOPOLOGY);
692  TSpecSet *root_tsetr = root_tsetl ? ssr.find_best_matches_info(source->get_rightson(), log, mode == COMPARE_TOPOLOGY) : NULp;
693 
694  if (root_tsetr) {
695  if (mode != COMPARE_TOPOLOGY) {
696  compare_progress.subtitle("Write group information");
697  error = ssr.write_node_information(log, mode == REMOVE_EXISTING_GROUPS, what, aci);
698  }
699 
700  if (!error) {
701  GroupPenalty dummy;
702 
703  ssr.setScorer(GroupMatchScorer()); // otherwise may find no matches
704  RSpecSet *new_root_rsetl = ssr.search_best_match(root_tsetl, dummy);
705  RSpecSet *new_root_rsetr = ssr.search_best_match(root_tsetr, dummy);
706 
707  nt_assert(new_root_rsetl && new_root_rsetr); // should always report matches (even really bad ones!)
708 
709  AP_tree *new_rootl = new_root_rsetl->set_node;
710  AP_tree *new_rootr = new_root_rsetr->set_node;
711 
712  // naive attempt to set root of target tree
713  // @@@ setting root wont work very well..
714  // - if these two nodes are NOT adjacent (esp. if 'new_rootr' is less "good" than 'new_rootl')
715  // - probably modifies topology even if not necessary; see ad_trees.cxx@NAIVE_ROOTING
716  new_rootl->set_root();
717  new_rootr->set_root();
718 
719  compare_progress.subtitle("Save trees");
720 
721  AP_tree *saved_root = (mode == COMPARE_TOPOLOGY) ? source : new_rootr->get_root_node();
722  error = GBT_overwrite_tree(rsave.get_gb_tree(), saved_root);
723 
724  if (!error) {
725  char *entry;
726  if (mode == COMPARE_TOPOLOGY) {
727  entry = GBS_global_string_copy("Compared topology with %s", tree_dest); // Note: the modified tree is 'tree_source'!
728  }
729  else {
730  const char *copiedOrAdded = mode == REMOVE_EXISTING_GROUPS ? "Copied" : "Added";
731 
732  entry = GBS_global_string_copy("%s node info %sfrom %s",
733  copiedOrAdded,
734  what == XFER_GROUPS_WITH_MARKED ? "of marked " : "",
735  tree_source);
736  }
737  GBT_log_to_tree_remark(rsave.get_gb_tree(), entry, true);
738  free(entry);
739  }
740  }
741  }
742 
743  delete root_tsetl;
744  delete root_tsetr;
745  }
746 
747  ssr.finish(error);
748  }
749  if (error) compare_progress.done();
750  }
751  }
752 
753  if (log) {
754  if (error) fprintf(log, "\nError: %s\n", error); // write error to log as well
755  else fputs("[done]\n", log);
756  fclose(log);
757  }
758 
759  return GB_end_transaction(gb_main, error);
760 }
761 
762 // --------------------------------------------------------------------------------
763 
764 #ifdef UNIT_TESTS
765 #ifndef TEST_UNIT_H
766 #include <test_unit.h>
767 #endif
768 
769 void TEST_species_sets() {
770  // SpecSetRegistry ssr(0, NULp); // does fatal error - fine
771 
772  GroupMatchScorer defaultScorer;
773  {
774  SpecSetRegistry s1(1, NULp, defaultScorer);
775  TEST_EXPECT_EQUAL(s1.bitstring_bytes(), 1);
776  TEST_EXPECT_EQUAL(s1.bitstring_longs(), 1);
777 
778  s1.add("one");
779  // s1.add("too much"); // fails assertion; fine
780  TSpecSet t1(NULp, s1, "one");
781  }
782  {
783  SpecSetRegistry s8(8, NULp, defaultScorer);
784  TEST_EXPECT_EQUAL(s8.bitstring_bytes(), 1);
785  TEST_EXPECT_EQUAL(s8.bitstring_longs(), 1);
786 
787  for (int i = 1; i<=8; ++i) {
788  s8.add(GBS_global_string("%i", i));
789  }
790  // s8.add("too much"); // fails assertion; fine
791  for (int i = 1; i<=8; ++i) {
792  TSpecSet t(NULp, s8, GBS_global_string("%i", i));
793  }
794  }
795  {
796  SpecSetRegistry s9(9, NULp, defaultScorer);
797  TEST_EXPECT_EQUAL(s9.bitstring_bytes(), 2);
798  TEST_EXPECT_EQUAL(s9.bitstring_longs(), 1);
799 
800  for (int i = 1; i<=9; ++i) {
801  s9.add(GBS_global_string("%i", i));
802  }
803  // s9.add("too much"); // fails assertion; fine
804  for (int i = 1; i<=9; ++i) {
805  TSpecSet t(NULp, s9, GBS_global_string("%i", i));
806  }
807  }
808 
809  {
810  SpecSetRegistry s64(64, NULp, defaultScorer);
811  TEST_EXPECT_EQUAL(s64.bitstring_bytes(), 8);
812  TEST_EXPECT_EQUAL(s64.bitstring_longs(), 1);
813 
814  for (int i = 1; i<=64; ++i) {
815  s64.add(GBS_global_string("%i", i));
816  }
817  // s64.add("too much"); // fails assertion; fine
818  for (int i = 1; i<=64; ++i) {
819  TSpecSet t(NULp, s64, GBS_global_string("%i", i));
820  }
821  }
822  {
823  SpecSetRegistry s65(65, NULp, defaultScorer);
824  TEST_EXPECT_EQUAL(s65.bitstring_bytes(), 9);
825  TEST_EXPECT_EQUAL(s65.bitstring_longs(), 2);
826 
827  for (int i = 1; i<=65; ++i) {
828  s65.add(GBS_global_string("%i", i));
829  }
830  // s65.add("too much"); // fails assertion; fine
831  for (int i = 1; i<=65; ++i) {
832  TSpecSet t(NULp, s65, GBS_global_string("%i", i));
833  }
834  }
835 }
836 
837 #endif // UNIT_TESTS
838 
839 // --------------------------------------------------------------------------------
GB_ERROR GB_begin_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2516
GB_ERROR GB_copy_dropProtectMarksAndTempstate(GBDATA *dest, GBDATA *source)
Definition: arbdb.cxx:2144
Definition: arbdbt.h:48
int size() const
int get_old_group_size() const
const char * GB_ERROR
Definition: arb_core.h:25
static GB_ERROR gxl_outgroup(GBL_command_arguments *args)
long bitstring_bytes() const
#define implicated(hypothesis, conclusion)
Definition: arb_assert.h:289
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:457
const TreeNode * get_root_node() const
Definition: TreeNode.h:382
double get_outgroup_ratio() const
Definition: NT_tree_cmp.h:94
virtual void set_root()
Definition: TreeNode.cxx:206
#define DOWNCAST_REFERENCE(totype, expr)
Definition: downcast.h:152
AP_tree * matchedNode() const
const GroupPenalty & get_match() const
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2549
void add(int v)
Definition: ClustalV.cxx:461
void storeBetterMatch(const GroupPenalty &match, AP_tree *matched_node)
GB_ERROR GB_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:293
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
TreeRoot * get_tree_root() const
Definition: TreeNode.h:380
GB_ERROR GBT_log_to_tree_remark(GBDATA *gb_tree, const char *log_entry, bool stamp)
Definition: adtree.cxx:491
double get_ingroup_ratio() const
Definition: NT_tree_cmp.h:93
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
void addPenalty(double p)
Definition: NT_tree_cmp.h:104
void GBK_terminatef(const char *templat,...)
Definition: arb_msg.cxx:477
bool betterThan(const GroupPenalty &other) const
Definition: NT_tree_cmp.h:89
STL namespace.
AP_tree * set_node
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:541
static GB_ERROR gxl_keeled(GBL_command_arguments *args)
void use_as_remark(const SmartCharPtr &newRemark)
Definition: TreeNode.h:275
int get_groupsize() const
Definition: NT_tree_cmp.h:96
void finish(GB_ERROR &error)
#define ARRAY_ELEMS(array)
Definition: arb_defs.h:19
bool isPerfectMatch() const
Definition: NT_tree_cmp.h:92
const GBL_call_env & get_callEnv() const
Definition: gb_aci.h:234
const GroupPenalty & bestMatch() const
static GroupPenalty NoMatch()
Definition: NT_tree_cmp.h:101
static const GBL_command_lookup_table & get_GroupXfer_customized_ACI_commands()
void registerUnfound(const GroupMatchScorer &scorer, const TSpecSet &tset)
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1904
unsigned count_leafs() const
Definition: AP_Tree.cxx:840
bool doesMatch() const
Definition: NT_tree_cmp.h:88
long bitstring_longs() const
CONSTEXPR_INLINE int leafs_2_nodes(int leafs, TreeModel model)
Definition: arbdbt.h:53
static GBL_command_definition groupXfer_command_table[]
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
static GB_ERROR gxl_newsize(GBL_command_arguments *args)
GBDATA * GB_create_container(GBDATA *father, const char *key)
Definition: arbdb.cxx:1827
GB_ERROR GBT_overwrite_tree(GBDATA *gb_tree, TreeNode *tree)
Definition: adtree.cxx:480
GBDATA * get_gb_tree() const
Definition: ARB_Tree.hxx:82
bool shouldHaveBeenKeeled() const
Definition: NT_tree_cmp.h:99
bool is_leaf_set() const
bool aborted()
Definition: arb_progress.h:277
const char * get_group_name() const
const GroupXfer_callenv & custom_env(GBL_command_arguments *args)
static GB_ERROR gxl_ingroup(GBL_command_arguments *args)
long get_species_index(const char *species_name) const
GB_ERROR GBT_write_group_name(GBDATA *gb_group_name, const char *new_group_name, bool pedantic)
Definition: adtree.cxx:229
TSpecSet * find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info)
static void error(const char *msg)
Definition: mkptypes.cxx:96
RSpecSet * registerTree(AP_tree *node)
fputc('\n', stderr)
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
GB_ERROR loadFromDB(const char *name) FINAL_OVERRIDE
Definition: AP_Tree.cxx:891
int get_unknown_members() const
GroupsToTransfer
Definition: NT_tree_cmp.h:27
int get_known_members() const
TSpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
#define nt_assert(cond)
Definition: NT_local.h:27
GroupXfer_callenv(const GBL_env &env_, const char *name_, const GroupPenalty &match_, int oldGroupSize_, int newGroupSize_)
void setScorer(const GroupMatchScorer &newScorer)
int count_clades() const
Definition: TreeNode.cxx:772
void remove_remark()
Definition: TreeNode.h:285
GroupPenalty matchGroups(const TSpecSet &sourceSet, const RSpecSet &targetSet, long commonSpecies, long overallSpecies)
fputs(TRACE_PREFIX, stderr)
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:264
bool is_leaf() const
Definition: TreeNode.h:171
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
const GroupPenalty & custom_match(GBL_command_arguments *args)
GroupTransferMode
Definition: NT_tree_cmp.h:21
GB_ERROR linkToDB(int *zombies, int *duplicates) __ATTR__USERESULT
Definition: ARB_Tree.cxx:130
int known_members
char * name
Definition: TreeNode.h:134
void init(AP_tree *nodei, const SpecSetRegistry &ssr)
void subtitle(const char *stitle)
Definition: arb_progress.h:263
bool contains_marked_species()
Definition: ARB_Tree.cxx:198
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:903
RSpecSet * search_best_match(const TSpecSet *tset, GroupPenalty &min_penalty)
SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer &scorer_)
Definition: NT_tree_cmp.cxx:22
#define IMPL_FORMATVALUE_CMD(args, fmt, value)
void set_remark(const char *newRemark)
Definition: TreeNode.h:279
void aw_message(const char *msg)
Definition: AW_status.cxx:932
double calcUnknownMembersPenalty(const TSpecSet &sourceSet) const
double get_penalty() const
Definition: NT_tree_cmp.h:91
#define NULp
Definition: cxxforward.h:97
RSpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
static GB_ERROR gxl_groupname(GBL_command_arguments *args)
static GB_ERROR gxl_unknown(GBL_command_arguments *args)
SpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
GB_ERROR check_validity() const
GB_ERROR write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci)
GB_ERROR NTREE_move_tree_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, GroupTransferMode mode, GroupsToTransfer what, const GroupMatchScorer &scorer, const char *aci)
NOT4PERL char * GB_command_interpreter_in_env(const char *str, const char *commands, const GBL_call_env &callEnv)
Definition: gb_aci.cxx:361
bool isValid() const
Definition: NT_tree_cmp.h:45
unsigned char * allocate_bitstring() const
static GB_ERROR gxl_penalty(GBL_command_arguments *args)
GBDATA * gb_node
Definition: TreeNode.h:133
GBDATA * gb_main
Definition: adname.cxx:33
void mark_as_keeled()
Definition: NT_tree_cmp.h:103
int get_new_group_size() const
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
const GBL_command_lookup_table & ACI_get_standard_commands()
Definition: adlang1.cxx:2746
unsigned char * bitstring
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:395
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
static GB_ERROR gxl_oldsize(GBL_command_arguments *args)