ARB
NT_tree_cmp.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : NT_tree_cmp.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "NT_species_set.h"
12 #include <aw_msg.hxx>
13 #include <gb_aci.h>
14 #include <gb_aci_impl.h>
15 #include <arb_progress.h>
16 #include <arb_defs.h>
17 #include <arb_msg_nospam.h>
18 #include <string>
19 #include <climits>
20 
21 using namespace std;
22 
23 SpecSetRegistry::SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer& scorer_) :
24  species_counter(0),
25  nspecies(nspecies_),
26  nsets(0),
27  sets(ARB_calloc<RSpecSet*>(max_nsets())),
28  scorer(scorer_),
29  progress(progress_),
30  species_hash(GBS_create_hash(nspecies, GB_IGNORE_CASE))
31 {
32  for (int i=0; i<256; i++) {
33  int j = i;
34  int count = 0;
35  while (j) { // count bits
36  if (j&1) count ++;
37  j = j>>1;
38  }
39  set_bits[i] = count;
40  }
41  tmp_bitstring = allocate_bitstring();
42 }
43 
45  for (int i=0; i<nsets; i++) delete sets[i];
46  free(tmp_bitstring);
47  free(sets);
48  GBS_free_hash(species_hash);
49 }
50 
51 void SpecSetRegistry::add(const char *species_name) {
52  if (GBS_read_hash(species_hash, species_name)) {
53  aw_message(GBS_global_string("Warning: Species '%s' is found more than once in tree", species_name));
54  }
55  else {
56  GBS_write_hash(species_hash, species_name, ++species_counter);
57  nt_assert(species_counter<=nspecies); // more species added than planned
58  nt_assert(species_counter<=nspecies);
59  }
60 }
61 
62 void SpecSetRegistry::add(RSpecSet *rset) {
63  nt_assert(rset->size()>1); // do NOT register leafs (only inner nodes allowed)
64  nt_assert(nsets<max_nsets());
65  sets[nsets++] = rset;
66 }
67 
68 void SpecSetRegistry::dump_bitstring(const char *tag, unsigned char *bs) {
69  fprintf(stderr, "%s: ", tag);
70  const long bbytes = bitstring_bytes();
71  for (long i = 0; i<bbytes; ++i) {
72  int val = bs[i];
73  for (int b = 0; b<8; ++b) {
74  fputc(val&128 ? 'X' : '-', stderr);
75  val<<=1;
76  }
77  }
78  fputc('\n', stderr);
79 }
80 
81 GroupPenalty GroupMatchScorer::calcPenalty(long removed, long added, long commonSpecies) {
82  // - can be calculated from removed/added/commonSpecies
83  // - shall be reported (result param?)
84 
85  if (commonSpecies) {
86  long oldGroupSize = removed+commonSpecies;
87  long newGroupSize = added+commonSpecies;
88  double ingroupRatio = 1.0 - removed/double(oldGroupSize); // = percent of old members in new group
89  double outgroupRatio = added/double(newGroupSize); // = percent of non-group-members in new group
90 
91  if (insideLimits(ingroupRatio, outgroupRatio)) {
92  double penalty =
93  // abs. penalty:
94  removed*ingroupPep +
95  added*outgroupPep +
96  // rel. penalty:
97  (1-ingroupRatio)*ingroupInvRelPen +
98  outgroupRatio*outgroupRelPen;
99 
100  return GroupPenalty(penalty, ingroupRatio, outgroupRatio, oldGroupSize);
101  }
102  }
103  return GroupPenalty::NoMatch();
104 }
105 
106 GroupPenalty GroupMatchScorer::matchGroups(const TSpecSet& sourceSet, const RSpecSet& targetSet, long commonSpecies, long overallSpecies) {
115  long removed = sourceSet.get_known_members() - commonSpecies;
116  long added = targetSet.get_known_members() - commonSpecies;
117 
118  GroupPenalty match = calcPenalty(removed, added, commonSpecies);
119  nt_assert(implicated(match.doesMatch(), (removed+added)<overallSpecies));
120 
121  long targetSize_keeled = overallSpecies - targetSet.get_known_members();
122  long commonSpecies_keeled = sourceSet.get_known_members() - commonSpecies;
123  long removed_keeled = commonSpecies;
124  long added_keeled = targetSize_keeled - commonSpecies_keeled;
125 
126  GroupPenalty match_keeled = calcPenalty(removed_keeled, added_keeled, commonSpecies_keeled);
127  match_keeled.mark_as_keeled();
128  if (match_keeled.doesMatch()) { // do not add to NoMatch!
129  match_keeled.addPenalty(keelPenalty);
130  }
131  nt_assert(implicated(match_keeled.doesMatch(), (removed_keeled+added_keeled)<overallSpecies));
132 
133  if (match_keeled.betterThan(match)) {
134  // @@@ if this happens, the match semantic is inverted: if moving a group -> group should be keeled (by caller)!
135  // related to #512 + #451
136  return match_keeled;
137  }
138 
139  return match;
140 }
141 
143  return sourceSet.get_unknown_members() * unfoundPep; // penalty for unknown members in source tree
144 }
145 
147  if (ingroupPep == 0.0 && ingroupInvRelPen == 0.0) {
148  return "one ingroup penalty has to be different from zero";
149  }
150  if (outgroupPep == 0.0 && outgroupRelPen == 0.0) {
151  return "one outgroup penalty has to be different from zero";
152  }
153 
154  if (ingroupPep<0.0 || outgroupPep<0.0 || ingroupInvRelPen<0.0 || outgroupRelPen<0.0) {
155  return "invalid negative in/outgroup penalty";
156  }
157 
158  if (!ingroup.isValid() || !outgroup.isValid()) {
159  return "invalid limits";
160  }
161 
162  return NULp;
163 }
164 
166  // returns best match for 'tset' (against all registered sets).
167  // NULp if no match found.
168  // sets 'min_penalty' to penalty found for returned set.
169 
170  RSpecSet *bset = NULp;
171  min_penalty = GroupPenalty::NoMatch();
172 
173  const long bbytes = bitstring_bytes();
174  const long blongs = bitstring_longs();
175 
176  unsigned char * const tbs = tset->bitstring;
177  long * const tbl = (long*)tbs;
178  long * const tmp_bl = (long*)tmp_bitstring;
179 
180  for (long i=nsets-1; i>=0; i--) {
181  long same = 0; // amount of known members present in both sets
182 
183  RSpecSet *rset = sets[i];
184 
185  unsigned char * const rbs = rset->bitstring;
186  long * const rbl = (long*)rbs;
187 
188  for (long j = 0; j<blongs; ++j) { // LOOP_VECTORIZED
189  tmp_bl[j] = tbl[j] & rbl[j];
190  }
191  for (long j = 0; j<bbytes; ++j) { // (Note: failed to optimize: gcc wont vectorize double-indexed access)
192  same += set_bits[tmp_bitstring[j]];
193  }
194 
195  GroupPenalty penalty = scorer.matchGroups(*tset, *rset, same, nspecies);
196  if (penalty.betterThan(min_penalty)) {
197  min_penalty = penalty;
198  bset = rset;
199  }
200  progress->inc();
201  }
202 
203  return bset;
204 }
205 
206 void GroupPenalty::registerUnfound(const GroupMatchScorer& scorer, const TSpecSet& tset) {
208  nt_assert(unknown == -1); // registerUnfound shall be called exactly once!
209  unknown = tset.get_unknown_members();
210 }
211 
212 double SpecSetRegistry::search_and_remember_best_match_and_log_errors(const TSpecSet *tset, FILE *log) {
225  GroupPenalty match;
226  RSpecSet *bset = search_best_match(tset, match);
227 
228  if (!bset) {
229  // no good match found.
230  // atm this happens for small groups (e.g. with 2 species) + a special placement of these species in tree.
231  // in the future 'finding no match' will get normal behavior.
232  return -1;
233  }
234 
235  AP_tree *earlierFoundGroup = bset->matchedNode(); // group previously considered for same destination node
236  AP_tree *currentGroup = tset->set_node;
237 
238  match.registerUnfound(scorer, *tset);
239  if (match.betterThan(bset->bestMatch())) {
240  nt_assert(!bset->bestMatch().isPerfectMatch()); // shall never supersede optimally placed group!
241  if (earlierFoundGroup && log) {
242  nt_assert(earlierFoundGroup->name && currentGroup->name);
243  fprintf(log, "Group '%s' skipped (got superseded by group '%s'; same best target positions)\n",
244  earlierFoundGroup->name,
245  currentGroup->name);
246  }
247 
248  bset->storeBetterMatch(match, currentGroup);
249  }
250  else {
251  nt_assert(!match.isPerfectMatch()); // found two groups with optimal placement. should be impossible by design! can be caused by invalid penalties
252  if (log) {
253  nt_assert(earlierFoundGroup->name && currentGroup->name);
254  fprintf(log, "Group '%s' skipped (does NOT supersede group '%s'; same best target positions)\n",
255  currentGroup->name,
256  earlierFoundGroup->name);
257  }
258  }
259 
260  return match.get_penalty();
261 }
262 
263 void SpecSet::init(AP_tree *nodei, const SpecSetRegistry& ssr) {
265  known_members = 0;
266  set_node = nodei;
267 }
268 
269 SpecSet::SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) {
270  init(nodei, ssr);
271 
272  long species_index = ssr.get_species_index(species_name);
273  if (species_index) {
274  nt_assert(species_index>0);
275  --species_index; // [1..N] -> [0..N-1]
276  nt_assert((species_index/8) < ssr.bitstring_bytes());
277  bitstring[species_index/8] |= 1 << (species_index % 8);
278  known_members = 1;
279  }
280 }
281 
282 SpecSet::SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const SpecSet *l, const SpecSet *r) {
283  init(nodei, ssr);
284 
285  const long *lbs = (const long *)l->bitstring;
286  const long *rbs = (const long *)r->bitstring;
287  long *dbs = (long *)bitstring;
288 
289  for (long j = ssr.bitstring_longs()-1; j>=0; j--) { // LOOP_VECTORIZED=2
290  dbs[j] = lbs[j] | rbs[j];
291  }
293 }
294 
296  free(bitstring);
297 }
298 
299 RSpecSet::RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) :
300  SpecSet(nodei, ssr, species_name),
301  best_node(NULp)
302 {}
303 RSpecSet::RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const RSpecSet *l, const RSpecSet *r) :
304  SpecSet(nodei, ssr, l, r),
305  best_node(NULp)
306 {}
307 TSpecSet::TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) :
308  SpecSet(nodei, ssr, species_name),
309  unfound_species_count(1-known_members)
310 {
311 }
312 TSpecSet::TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const TSpecSet *l, const TSpecSet *r) :
313  SpecSet(nodei, ssr, l, r),
314  unfound_species_count(l->unfound_species_count + r->unfound_species_count)
315 {
316 }
317 
319  RSpecSet *ss;
320  // Warning: confusing resource handling:
321  // - leafs are returned "NOT owned by anybody"
322  // - inner nodes are added to and owned by this->sets
323 
324  if (node->is_leaf()) {
325  this->add(node->name);
326  ss = new RSpecSet(node, *this, node->name);
327  nt_assert(ss->is_leaf_set());
328  }
329  else {
330  RSpecSet *ls = registerTree(node->get_leftson());
331  RSpecSet *rs = registerTree(node->get_rightson());
332 
333  ss = new RSpecSet(node, *this, ls, rs);
334  this->add(ss);
335  nt_assert(!ss->is_leaf_set());
336 
337  if (ls->is_leaf_set()) delete ls;
338  if (rs->is_leaf_set()) delete rs;
339  }
340  return ss;
341 }
342 
343 TSpecSet *SpecSetRegistry::find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info) {
344  /* Go through all nodes of subtree 'node' (part of the source tree if copying info)
345  * and search for the best matching registered node (in this->sets; which belong to dest tree if copying info).
346  *
347  * [Note: If comparing topologies, source- and dest-trees were exchanged by caller.]
348  *
349  * Named groups (if copying info) resp. all nodes (if comparing topologies)
350  * get compared versus all registered nodes and the best matching node is detected.
351  * The compared node is stored inside the best matching node.
352  */
353 
354  nt_assert(contradicted(log, compare_node_info));
355 
356  TSpecSet *ss = NULp;
357  if (node->is_leaf()) {
358  ss = new TSpecSet(node, *this, node->name);
359  }
360  else {
361  TSpecSet *ls = find_best_matches_info(node->get_leftson(), log, compare_node_info);
362  TSpecSet *rs = ls ? find_best_matches_info(node->get_rightson(), log, compare_node_info) : NULp;
363 
364  if (rs) {
365  ss = new TSpecSet(node, *this, ls, rs);
366  if (compare_node_info) {
367  nt_assert(!log); // does not work / not allowed
368 
369  double penalty = search_and_remember_best_match_and_log_errors(ss, log);
370  int ipenalty = int(penalty);
371 
372  // the #-sign is important (otherwise TREE_write_Newick will not work correctly; interference with bootstrap values; refer to #787 + #767)
373  if (ipenalty>0) node->use_as_remark(GBS_global_string_copy("# %i", ipenalty));
374  else if (ipenalty<0) node->set_remark("# no match");
375  else node->remove_remark();
376  }
377  else {
378  if (node->name) {
379  double penalty = search_and_remember_best_match_and_log_errors(ss, log);
380  if (penalty<0) {
381  nt_assert(log);
382  fprintf(log, "Group '%s' doesn't fit to any destination subtree.\n", node->name);
383  }
384  }
385  }
386  }
387  delete rs;
388  delete ls;
389  }
390 
391  if (ss && progress->aborted()) {
392  delete ss;
393  ss = NULp; // abort recursion after user-abort
394  }
395  return ss;
396 }
397 
398 // -----------------------------
399 // local ACI extension
400 
402  RefPtr<const char> name;
403  const GroupPenalty& match;
404 
405  int oldGroupSize;
406  int newGroupSize;
407 
408 public:
409  GroupXfer_callenv(const GBL_env& env_, const char *name_, const GroupPenalty& match_, int oldGroupSize_, int newGroupSize_) :
410  GBL_call_env(NULp, env_),
411  name(name_),
412  match(match_),
413  oldGroupSize(oldGroupSize_),
414  newGroupSize(newGroupSize_)
415  {}
416 
417  const char *get_group_name() const { return name; }
418  const GroupPenalty& get_match() const { return match; }
419  int get_old_group_size() const { return oldGroupSize; }
420  int get_new_group_size() const { return newGroupSize; }
421 };
422 
425 
426 using namespace GBL_IMPL;
427 
428 #define IMPL_FORMATVALUE_CMD(args,fmt,value) \
429  COMMAND_DROPS_INPUT_STREAMS(args); \
430  GB_ERROR error = check_no_parameter(args); \
431  if (!error) FORMAT_2_OUT(args, fmt, value); \
432  return error
433 
434 static GB_ERROR gxl_groupname(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%s", custom_gx_env(args).get_group_name()); }
435 static GB_ERROR gxl_penalty(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%f", custom_match(args).get_penalty()); }
436 static GB_ERROR gxl_ingroup(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%.1f%%", custom_match(args).get_ingroup_ratio()*100.0); }
437 static GB_ERROR gxl_outgroup(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%.1f%%", custom_match(args).get_outgroup_ratio()*100.0); }
438 static GB_ERROR gxl_oldsize(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_gx_env(args).get_old_group_size()); }
439 static GB_ERROR gxl_newsize(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_gx_env(args).get_new_group_size()); }
440 static GB_ERROR gxl_unknown(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_match(args).get_unknown()); }
441 static GB_ERROR gxl_keeled(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", int(custom_match(args).shouldHaveBeenKeeled())); }
442 
444  { "groupname", gxl_groupname },
445  { "penalty", gxl_penalty },
446  { "ingroup", gxl_ingroup },
447  { "outgroup", gxl_outgroup },
448  { "oldsize", gxl_oldsize },
449  { "newsize", gxl_newsize },
450  { "unknown", gxl_unknown },
451  { "keeled", gxl_keeled },
452  { NULp, NULp }
453 };
454 
456  static GBL_custom_command_lookup_table clt(groupXfer_command_table,
457  ARRAY_ELEMS(groupXfer_command_table)-1,
459  return clt;
460 }
461 
462 
463 
464 GB_ERROR SpecSetRegistry::write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci) {
465  GB_ERROR error = NULp;
466  MessageSpamFilter suppress("problematic group names");
467 
468  if (log) fputs("\nDetailed group changes:\n\n", log);
469 
470  GBL_env env(NULp, // have no item here
471  NULp, // use no treename
473 
474  for (long j=this->nsets-1; j>=0 && !error; j--) {
475  RSpecSet * const cset = this->sets[j];
476 
477  char *old_group_name = NULp; // !=NULp -> old node has been deleted
478  AP_tree *sourceNode = cset->matchedNode();
479  AP_tree *targetNode = cset->set_node;
480  bool insert_new_node = sourceNode && sourceNode->name;
481 
482  if (what == XFER_GROUPS_WITH_MARKED && insert_new_node) {
483  if (!targetNode->contains_marked_species()) insert_new_node = false; // @@@ this tests the wrong tree (the target tree)! (#792)
484  }
485 
486  if (targetNode->gb_node && (delete_old_nodes || insert_new_node)) { // There is already a node, delete old
487  if (!targetNode->name) {
488  GBDATA *gb_name = GB_entry(targetNode->gb_node, "group_name");
489  if (gb_name) {
490  targetNode->name = GB_read_string(gb_name);
491  }
492  else {
493  targetNode->name = ARB_strdup("<gb_node w/o name>"); // @@@ happens whenever any other node info is present (e.g. linewidth or angles; node gets deleted when delete_old_nodes==true); #793
494  }
495  }
496 
497  old_group_name = ARB_strdup(targetNode->name); // store old_group_name to rename new inserted node
498 
499  error = GB_delete(targetNode->gb_node);
500  if (!error) {
501  targetNode->gb_node = NULp;
502  freenull(targetNode->name);
503  }
504  }
505 
506  if (!error) {
507  if (insert_new_node) {
508  targetNode->gb_node = GB_create_container(targetNode->get_tree_root()->get_gb_tree(), "node");
509  error = GB_copy_dropProtectMarksAndTempstate(targetNode->gb_node, sourceNode->gb_node); // @@@ this copies more info than group-name (e.g. linewidth or angles; unwanted!); #793
510  // @@@ need a function which copies/moves/deletes group-related info (between two nodes). currently affected fields: 'group_name', 'grouped', 'keeled'; #793
511 
512  if (!error) {
513  GBDATA *gb_name = GB_entry(targetNode->gb_node, "group_name");
514  nt_assert(gb_name);
515  if (gb_name) {
516  const GroupPenalty& match = cset->bestMatch();
517 
518  int oldGroupsize = match.get_groupsize();
519  int newGroupsize = cset->size();
520 
521  char *new_group_name = NULp; // new wanted groupname
522  {
523  char *source_group_name = GB_read_string(gb_name); // existing groupname (in group-container copied from source tree)
524  if (aci) {
525  GroupXfer_callenv callEnv(env, source_group_name, match, oldGroupsize, newGroupsize);
526  new_group_name = GB_command_interpreter_in_env("", aci, callEnv);
527  if (!new_group_name) error = GB_await_error();
528  }
529  else {
530  reassign(new_group_name, source_group_name);
531  }
532  free(source_group_name);
533  }
534 
535  if (!error) {
536  nt_assert(new_group_name);
537  if (old_group_name) { // overwrite existing group
538  if (!delete_old_nodes) {
539  if (strcmp(old_group_name, new_group_name) != 0) { // old and new name differ
540  char *combined_name = GBS_global_string_copy("%s [was: %s]", new_group_name, old_group_name);
541  if (log) fprintf(log, "Destination group '%s' overwritten by '%s'", old_group_name, combined_name);
542  reassign(new_group_name, combined_name);
543  }
544  else { // name matches existing
545  if (log) fprintf(log, "Group '%s' remains unchanged", old_group_name);
546  }
547  }
548  else {
549  if (log) {
550  if (strcmp(old_group_name, new_group_name) == 0) {
551  fprintf(log, "Group '%s' remains unchanged", old_group_name);
552  }
553  else {
554  fprintf(log, "Destination group '%s' overwritten by '%s'", old_group_name, new_group_name);
555  }
556  }
557  }
558  }
559  else { // no group at destination -> insert
560  if (log) fprintf(log, "Group '%s' inserted", new_group_name);
561  }
562  }
563 
564  if (!error) error = GBT_write_group_name(gb_name, new_group_name, true); // always write wanted target name
565 
566  if (log) {
567  if (error) {
568  fprintf(log, " (Failed! Reason: %s)\n", error);
569  }
570  else if (match.isPerfectMatch()) {
571  fputc('\n', log);
572  }
573  else {
574  fprintf(log, " (not placed optimal; penalty=%f; in=%.1f%%/%i; out=%.1f%%/%i%s)\n",
575  match.get_penalty(),
576  match.get_ingroup_ratio() *100.0, oldGroupsize,
577  match.get_outgroup_ratio()*100.0, newGroupsize,
578  match.shouldHaveBeenKeeled() ? "; WARNING: matched inverse set, i.e. rest of tree!" : "");
579  }
580  }
581 
582  free(new_group_name);
583  }
584  }
585  }
586  else {
587  nt_assert(implicated(old_group_name, delete_old_nodes));
588  if (old_group_name && log) {
589  fprintf(log, "Destination group '%s' removed\n", old_group_name);
590  }
591  }
592  }
593  free(old_group_name);
594  }
595  return error;
596 }
597 
599  if (!error) error = progress->error_if_aborted();
600  progress->done();
601 }
602 
603 GB_ERROR NTREE_move_tree_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, GroupTransferMode mode, GroupsToTransfer what, const GroupMatchScorer& scorer, const char *aci) {
604  GB_ERROR error = NULp;
605  FILE *log = NULp;
606 
607  if (!contradicted(mode == COMPARE_TOPOLOGY, log_file)) {
608  GBK_terminatef("invalid use of function NTREE_move_tree_info: mode=%i log_file=%s\n", int(mode), log_file);
609  }
610 
611  if (aci && !aci[0]) aci = NULp; // handle "empty ACI" like "no ACI"
612 
613  if (mode == COMPARE_TOPOLOGY) {
614  // info is written into 'tree_source' in .@find_best_matches_info
615  // (but we want to modify destination tree - like 'mode node info' does)
616  std::swap(tree_source, tree_dest);
617  }
618 
619  if (log_file) {
620  nt_assert(mode == REMOVE_EXISTING_GROUPS || mode == KEEP_OLD_NAMES);
621  log = fopen(log_file, "w");
622  if (log) {
623  const char *transferredGroups = what == XFER_GROUPS_WITH_MARKED ? "groups containing marked" : "all";
624  const char *overwriteMode = NULp;
625 
626  switch (mode) {
627  case COMPARE_TOPOLOGY: nt_assert(0); break;
628  case REMOVE_EXISTING_GROUPS: overwriteMode = "remove existing groups"; break;
629  case KEEP_OLD_NAMES: overwriteMode = "keep old names"; break;
630  }
631 
632  nt_assert(transferredGroups && overwriteMode);
633 
634  fprintf(log, // start of log
635  "LOGFILE: transfer group information\n"
636  "\n"
637  " Source tree '%s'\n"
638  "Destination tree '%s'\n"
639  "\n"
640  "transferred groups: %s\n"
641  " overwrite mode: %s\n"
642  "\n",
643  tree_source,
644  tree_dest,
645  transferredGroups,
646  overwriteMode);
647  }
648  else {
649  error = GB_IO_error("writing", log_file);
650  }
651  }
652 
653  GB_begin_transaction(gb_main);
654 
655  AP_tree_root rsource(new AliView(gb_main), NULp, false, NULp);
656  AP_tree_root rdest (new AliView(gb_main), NULp, false, NULp);
657  AP_tree_root& rsave = (mode == COMPARE_TOPOLOGY) ? rsource : rdest;
658  arb_progress progress(mode == COMPARE_TOPOLOGY ? "Compare topologies" : (mode == REMOVE_EXISTING_GROUPS ? "Copy group information" : "Add group information"));
659 
660  if (!error) error = scorer.check_validity();
661  if (!error) error = rsource.loadFromDB(tree_source);
662  if (!error) error = rsource.linkToDB(NULp, NULp);
663  if (!error) {
664  error = rdest.loadFromDB(tree_dest);
665  if (!error) error = rdest.linkToDB(NULp, NULp);
666  if (!error) {
667  AP_tree *source = rsource.get_root_node();
668  AP_tree *dest = rdest.get_root_node();
669 
670  int dest_leafs = dest->count_leafs();
671  int dest_nodes = leafs_2_nodes(dest_leafs, ROOTED);
672  int dest_inodes = dest_nodes-dest_leafs; // inner nodes
673 
674  int source_leafs = source->count_leafs();
675  int source_nodes = leafs_2_nodes(source_leafs, ROOTED);
676  int source_inodes = source_nodes-source_leafs; // inner nodes
677 
678  size_t compared_nodes = mode == COMPARE_TOPOLOGY ? source_inodes : source->count_clades();
679  size_t progress_steps = (compared_nodes + 2) * dest_inodes; // 2 for searching both sides of root
680 
681  arb_progress compare_progress(progress_steps);
682  compare_progress.subtitle("Register topology");
683 
684  {
685  SpecSetRegistry ssr(dest_leafs, &compare_progress, scorer);
686 
687  ssr.registerTree(dest);
688 
689  if (source_leafs < 3) error = GB_export_errorf("tree '%s' has less than 3 species", tree_source);
690  else {
691  compare_progress.subtitle("Match subtrees");
692 
693  TSpecSet *root_tsetl = ssr.find_best_matches_info(source->get_leftson(), log, mode == COMPARE_TOPOLOGY);
694  TSpecSet *root_tsetr = root_tsetl ? ssr.find_best_matches_info(source->get_rightson(), log, mode == COMPARE_TOPOLOGY) : NULp;
695 
696  if (root_tsetr) {
697  if (mode != COMPARE_TOPOLOGY) {
698  compare_progress.subtitle("Write group information");
699  error = ssr.write_node_information(log, mode == REMOVE_EXISTING_GROUPS, what, aci);
700  }
701 
702  if (!error) {
703  GroupPenalty dummy;
704 
705  ssr.setScorer(GroupMatchScorer()); // otherwise may find no matches
706  RSpecSet *new_root_rsetl = ssr.search_best_match(root_tsetl, dummy);
707  RSpecSet *new_root_rsetr = ssr.search_best_match(root_tsetr, dummy);
708 
709  nt_assert(new_root_rsetl && new_root_rsetr); // should always report matches (even really bad ones!)
710 
711  AP_tree *new_rootl = new_root_rsetl->set_node;
712  AP_tree *new_rootr = new_root_rsetr->set_node;
713 
714  // naive attempt to set root of target tree
715  // @@@ setting root wont work very well..
716  // - if these two nodes are NOT adjacent (esp. if 'new_rootr' is less "good" than 'new_rootl')
717  // - probably modifies topology even if not necessary; see ad_trees.cxx@NAIVE_ROOTING
718  new_rootl->set_root();
719  new_rootr->set_root();
720 
721  compare_progress.subtitle("Save trees");
722 
723  AP_tree *saved_root = (mode == COMPARE_TOPOLOGY) ? source : new_rootr->get_root_node();
724  error = GBT_overwrite_tree(rsave.get_gb_tree(), saved_root);
725 
726  if (!error) {
727  char *entry;
728  if (mode == COMPARE_TOPOLOGY) {
729  entry = GBS_global_string_copy("Compared topology with %s", tree_dest); // Note: the modified tree is 'tree_source'!
730  }
731  else {
732  const char *copiedOrAdded = mode == REMOVE_EXISTING_GROUPS ? "Copied" : "Added";
733 
734  entry = GBS_global_string_copy("%s node info %sfrom %s",
735  copiedOrAdded,
736  what == XFER_GROUPS_WITH_MARKED ? "of marked " : "",
737  tree_source);
738  }
739  GBT_log_to_tree_remark(rsave.get_gb_tree(), entry, true);
740  free(entry);
741  }
742  }
743  }
744 
745  delete root_tsetl;
746  delete root_tsetr;
747  }
748 
749  ssr.finish(error);
750  }
751  if (error) compare_progress.done();
752  }
753  }
754 
755  if (log) {
756  if (error) fprintf(log, "\nError: %s\n", error); // write error to log as well
757  else fputs("[done]\n", log);
758  fclose(log);
759  }
760 
761  return GB_end_transaction(gb_main, error);
762 }
763 
764 // --------------------------------------------------------------------------------
765 
766 #ifdef UNIT_TESTS
767 #ifndef TEST_UNIT_H
768 #include <test_unit.h>
769 #endif
770 
771 void TEST_species_sets() {
772  // SpecSetRegistry ssr(0, NULp); // does fatal error - fine
773 
774  GroupMatchScorer defaultScorer;
775  {
776  SpecSetRegistry s1(1, NULp, defaultScorer);
777  TEST_EXPECT_EQUAL(s1.bitstring_bytes(), 1);
778  TEST_EXPECT_EQUAL(s1.bitstring_longs(), 1);
779 
780  s1.add("one");
781  // s1.add("too much"); // fails assertion; fine
782  TSpecSet t1(NULp, s1, "one");
783  }
784  {
785  SpecSetRegistry s8(8, NULp, defaultScorer);
786  TEST_EXPECT_EQUAL(s8.bitstring_bytes(), 1);
787  TEST_EXPECT_EQUAL(s8.bitstring_longs(), 1);
788 
789  for (int i = 1; i<=8; ++i) {
790  s8.add(GBS_global_string("%i", i));
791  }
792  // s8.add("too much"); // fails assertion; fine
793  for (int i = 1; i<=8; ++i) {
794  TSpecSet t(NULp, s8, GBS_global_string("%i", i));
795  }
796  }
797  {
798  SpecSetRegistry s9(9, NULp, defaultScorer);
799  TEST_EXPECT_EQUAL(s9.bitstring_bytes(), 2);
800  TEST_EXPECT_EQUAL(s9.bitstring_longs(), 1);
801 
802  for (int i = 1; i<=9; ++i) {
803  s9.add(GBS_global_string("%i", i));
804  }
805  // s9.add("too much"); // fails assertion; fine
806  for (int i = 1; i<=9; ++i) {
807  TSpecSet t(NULp, s9, GBS_global_string("%i", i));
808  }
809  }
810 
811  {
812  SpecSetRegistry s64(64, NULp, defaultScorer);
813  TEST_EXPECT_EQUAL(s64.bitstring_bytes(), 8);
814  TEST_EXPECT_EQUAL(s64.bitstring_longs(), 1);
815 
816  for (int i = 1; i<=64; ++i) {
817  s64.add(GBS_global_string("%i", i));
818  }
819  // s64.add("too much"); // fails assertion; fine
820  for (int i = 1; i<=64; ++i) {
821  TSpecSet t(NULp, s64, GBS_global_string("%i", i));
822  }
823  }
824  {
825  SpecSetRegistry s65(65, NULp, defaultScorer);
826  TEST_EXPECT_EQUAL(s65.bitstring_bytes(), 9);
827  TEST_EXPECT_EQUAL(s65.bitstring_longs(), 2);
828 
829  for (int i = 1; i<=65; ++i) {
830  s65.add(GBS_global_string("%i", i));
831  }
832  // s65.add("too much"); // fails assertion; fine
833  for (int i = 1; i<=65; ++i) {
834  TSpecSet t(NULp, s65, GBS_global_string("%i", i));
835  }
836  }
837 }
838 
839 #endif // UNIT_TESTS
840 
841 // --------------------------------------------------------------------------------
GB_ERROR GB_begin_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2528
GB_ERROR GB_copy_dropProtectMarksAndTempstate(GBDATA *dest, GBDATA *source)
Definition: arbdb.cxx:2152
Definition: arbdbt.h:48
int size() const
int get_old_group_size() const
const char * GB_ERROR
Definition: arb_core.h:25
static GB_ERROR gxl_outgroup(GBL_command_arguments *args)
long bitstring_bytes() const
#define implicated(hypothesis, conclusion)
Definition: arb_assert.h:289
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:454
const TreeNode * get_root_node() const
Definition: TreeNode.h:423
double get_outgroup_ratio() const
Definition: NT_tree_cmp.h:94
virtual void set_root()
Definition: TreeNode.cxx:206
#define DOWNCAST_REFERENCE(totype, expr)
Definition: downcast.h:152
AP_tree * matchedNode() const
const GroupPenalty & get_match() const
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2561
void add(int v)
Definition: ClustalV.cxx:461
void storeBetterMatch(const GroupPenalty &match, AP_tree *matched_node)
GB_ERROR GB_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:285
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
TreeRoot * get_tree_root() const
Definition: TreeNode.h:421
GB_ERROR GBT_log_to_tree_remark(GBDATA *gb_tree, const char *log_entry, bool stamp)
Definition: adtree.cxx:537
double get_ingroup_ratio() const
Definition: NT_tree_cmp.h:93
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
void addPenalty(double p)
Definition: NT_tree_cmp.h:104
void GBK_terminatef(const char *templat,...)
Definition: arb_msg.cxx:523
bool betterThan(const GroupPenalty &other) const
Definition: NT_tree_cmp.h:89
STL namespace.
AP_tree * set_node
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:538
static GB_ERROR gxl_keeled(GBL_command_arguments *args)
void use_as_remark(const SmartCharPtr &newRemark)
Definition: TreeNode.h:316
int get_groupsize() const
Definition: NT_tree_cmp.h:96
void finish(GB_ERROR &error)
#define ARRAY_ELEMS(array)
Definition: arb_defs.h:19
bool isPerfectMatch() const
Definition: NT_tree_cmp.h:92
const GBL_call_env & get_callEnv() const
Definition: gb_aci.h:234
const GroupPenalty & bestMatch() const
static GroupPenalty NoMatch()
Definition: NT_tree_cmp.h:101
static const GBL_command_lookup_table & get_GroupXfer_customized_ACI_commands()
void registerUnfound(const GroupMatchScorer &scorer, const TSpecSet &tset)
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
unsigned count_leafs() const
Definition: AP_Tree.cxx:840
bool doesMatch() const
Definition: NT_tree_cmp.h:88
long bitstring_longs() const
CONSTEXPR_INLINE int leafs_2_nodes(int leafs, TreeModel model)
Definition: arbdbt.h:53
static GBL_command_definition groupXfer_command_table[]
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
static GB_ERROR gxl_newsize(GBL_command_arguments *args)
GBDATA * GB_create_container(GBDATA *father, const char *key)
Definition: arbdb.cxx:1829
GB_ERROR GBT_overwrite_tree(GBDATA *gb_tree, TreeNode *tree)
Definition: adtree.cxx:526
GBDATA * get_gb_tree() const
Definition: ARB_Tree.hxx:82
bool shouldHaveBeenKeeled() const
Definition: NT_tree_cmp.h:99
bool is_leaf_set() const
bool aborted()
Definition: arb_progress.h:335
const char * get_group_name() const
static GB_ERROR gxl_ingroup(GBL_command_arguments *args)
long get_species_index(const char *species_name) const
GB_ERROR GBT_write_group_name(GBDATA *gb_group_name, const char *new_group_name, bool pedantic)
Definition: adtree.cxx:230
TSpecSet * find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info)
static void error(const char *msg)
Definition: mkptypes.cxx:96
RSpecSet * registerTree(AP_tree *node)
fputc('\n', stderr)
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
GB_ERROR loadFromDB(const char *name) FINAL_OVERRIDE
Definition: AP_Tree.cxx:891
int get_unknown_members() const
GroupsToTransfer
Definition: NT_tree_cmp.h:27
int get_known_members() const
TSpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
#define nt_assert(cond)
Definition: NT_local.h:27
GroupXfer_callenv(const GBL_env &env_, const char *name_, const GroupPenalty &match_, int oldGroupSize_, int newGroupSize_)
void setScorer(const GroupMatchScorer &newScorer)
int count_clades() const
Definition: TreeNode.cxx:799
void remove_remark()
Definition: TreeNode.h:326
GroupPenalty matchGroups(const TSpecSet &sourceSet, const RSpecSet &targetSet, long commonSpecies, long overallSpecies)
fputs(TRACE_PREFIX, stderr)
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:262
bool is_leaf() const
Definition: TreeNode.h:211
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
const GroupPenalty & custom_match(GBL_command_arguments *args)
GroupTransferMode
Definition: NT_tree_cmp.h:21
GB_ERROR linkToDB(int *zombies, int *duplicates) __ATTR__USERESULT
Definition: ARB_Tree.cxx:130
int known_members
char * name
Definition: TreeNode.h:174
void init(AP_tree *nodei, const SpecSetRegistry &ssr)
void subtitle(const char *stitle)
Definition: arb_progress.h:321
bool contains_marked_species()
Definition: ARB_Tree.cxx:198
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:909
RSpecSet * search_best_match(const TSpecSet *tset, GroupPenalty &min_penalty)
SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer &scorer_)
Definition: NT_tree_cmp.cxx:23
#define IMPL_FORMATVALUE_CMD(args, fmt, value)
void set_remark(const char *newRemark)
Definition: TreeNode.h:320
void aw_message(const char *msg)
Definition: AW_status.cxx:1142
double calcUnknownMembersPenalty(const TSpecSet &sourceSet) const
double get_penalty() const
Definition: NT_tree_cmp.h:91
const GroupXfer_callenv & custom_gx_env(GBL_command_arguments *args)
#define NULp
Definition: cxxforward.h:116
RSpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
static GB_ERROR gxl_groupname(GBL_command_arguments *args)
static GB_ERROR gxl_unknown(GBL_command_arguments *args)
SpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
GB_ERROR check_validity() const
GB_ERROR write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci)
GB_ERROR NTREE_move_tree_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, GroupTransferMode mode, GroupsToTransfer what, const GroupMatchScorer &scorer, const char *aci)
NOT4PERL char * GB_command_interpreter_in_env(const char *str, const char *commands, const GBL_call_env &callEnv)
Definition: gb_aci.cxx:361
bool isValid() const
Definition: NT_tree_cmp.h:45
unsigned char * allocate_bitstring() const
static GB_ERROR gxl_penalty(GBL_command_arguments *args)
GBDATA * gb_node
Definition: TreeNode.h:173
GBDATA * gb_main
Definition: adname.cxx:32
void mark_as_keeled()
Definition: NT_tree_cmp.h:103
int get_new_group_size() const
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1294
const GBL_command_lookup_table & ACI_get_standard_commands()
Definition: adlang1.cxx:2749
unsigned char * bitstring
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:392
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
static GB_ERROR gxl_oldsize(GBL_command_arguments *args)