ARB
TreeRead.cxx
Go to the documentation of this file.
1 // ============================================================ //
2 // //
3 // File : TreeRead.cxx //
4 // Purpose : load tree from file //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // www.arb-home.de //
8 // //
9 // ============================================================ //
10 
11 #include "TreeRead.h"
12 #include <TreeNode.h>
13 
14 #include <arb_msg_fwd.h>
15 #include <arb_strbuf.h>
16 #include <arb_file.h>
17 #include <arb_defs.h>
18 #include <algorithm>
19 
20 #define tree_assert(cond) arb_assert(cond)
21 
22 /*!******************************************************************************************
23  load a tree from file system
24 ********************************************************************************************/
25 
26 #define MAX_DROPPED_GROUP_WARN 100
27 
28 // --------------------
29 // TreeReader
30 
31 class TreeReader : virtual Noncopyable {
32  enum tr_lfmode { LF_UNKNOWN, LF_N, LF_R, LF_NR, LF_RN, };
33 
34  int unnamed_counter;
35  char *tree_file_name;
36  FILE *in;
37  int last_character; // may be EOF
38  int line_cnt;
39 
40  GBS_strstruct tree_comment;
41  GBT_LEN max_found_branchlen;
42  double max_found_bootstrap;
43  tr_lfmode lfmode;
44 
45  char *warnings;
46 
47  TreeRoot *troot;
48 
49  struct Count {
50  int dropped_leaf_groups;
51  int dropped_duplicated_groups;
52  Count() : dropped_leaf_groups(0), dropped_duplicated_groups(0) {}
53  } count;
54 
55  void setError(const char *message);
56  void setErrorAt(const char *message);
57  void setExpectedError(const char *expected);
58 
59  int get_char();
60  int read_char();
61  int read_tree_char(); // extracts comments and ignores whitespace outside comments
62 
63  char *content_ahead(size_t how_many, bool show_eof);
64 
65  void drop_tree_char(char expected);
66 
67  void setBranchName_acceptingBootstrap(TreeNode *node, char*& name);
68 
69  // The eat-functions below assume that the "current" character
70  // has already been read into 'last_character':
71  void eat_white();
72  __ATTR__USERESULT bool eat_number(GBT_LEN& result);
73  char *eat_quoted_string();
74  bool eat_and_set_name_and_length(TreeNode *node, GBT_LEN& len);
75 
76  char *unnamedNodeName() { return GBS_global_string_copy("unnamed%i", ++unnamed_counter); }
77 
78  TreeNode *load_subtree(GBT_LEN& nodeLen);
79  TreeNode *load_named_node(GBT_LEN& nodeLen);
80 
81 public:
82 
83  TreeReader(FILE *input, const char *file_name, TreeRoot *troot_);
84  ~TreeReader();
85 
87  GBT_LEN rootNodeLen = DEFAULT_BRANCH_LENGTH_MARKER; // ignored dummy
88  TreeNode *tree = load_named_node(rootNodeLen);
89 
90  if (!error) {
91  if (rootNodeLen != DEFAULT_BRANCH_LENGTH_MARKER && rootNodeLen != 0.0) {
92  add_warning("Length specified for root-node has been ignored");
93  }
94 
95  // check for unexpected input
96  if (last_character == ';') read_tree_char(); // accepts ';'
97  if (last_character != EOF) {
98  char *unused_input = content_ahead(30, false);
99  add_warningf("Unexpected input-data after tree: '%s'", unused_input);
100  free(unused_input);
101  }
103  }
104  return tree;
105  }
106 
108 
109  void add_warning(const char *msg) {
110  if (warnings) freeset(warnings, GBS_global_string_copy("%s\n%s", warnings, msg));
111  else warnings = GBS_global_string_copy("Warning(s): %s", msg);
112  }
113  __ATTR__FORMAT(2) void add_warningf(const char *format, ...) { FORWARD_FORMATTED(add_warning, format); }
114 
115  GB_ERROR get_warnings() const { return warnings; } // valid until TreeReader is destroyed
116 
117  char *takeComment() {
118  // can only be called once (further calls will return NULp)
119  return tree_comment.release();
120  }
121 
122  double get_max_found_bootstrap() const { return max_found_bootstrap; }
123  GBT_LEN get_max_found_branchlen() const { return max_found_branchlen; }
124 };
125 
126 TreeReader::TreeReader(FILE *input, const char *file_name, TreeRoot *troot_)
127  : unnamed_counter(0),
128  tree_file_name(strdup(file_name)),
129  in(input),
130  last_character(0),
131  line_cnt(1),
132  tree_comment(2048),
133  max_found_branchlen(-1),
134  max_found_bootstrap(-1),
135  lfmode(LF_UNKNOWN),
136  warnings(NULp),
137  troot(troot_),
138  error(NULp)
139 {
140  read_tree_char();
141 }
142 
144  free(warnings);
145  free(tree_file_name);
146 }
147 
148 void TreeReader::setError(const char *message) {
149  tree_assert(!error);
150  error = GBS_global_string("Error reading %s:%i: %s",
151  tree_file_name, line_cnt, message);
152 }
153 char *TreeReader::content_ahead(size_t how_many, bool show_eof) {
154  char show[how_many+1+4]; // 4 = oversize of '<EOF>'
155  size_t i;
156  for (i = 0; i<how_many; ++i) {
157  show[i] = last_character;
158  if (show[i] == EOF) {
159  if (show_eof) {
160  strcpy(show+i, "<EOF>");
161  i += 5;
162  }
163  break;
164  }
165  read_char();
166  }
167  show[i] = 0;
168  return strdup(show);
169 }
170 
171 void TreeReader::setErrorAt(const char *message) {
172  if (last_character == EOF) {
173  setError(GBS_global_string("%s while end-of-file was reached", message));
174  }
175  else {
176  char *show = content_ahead(30, true);
177  setError(GBS_global_string("%s while looking at '%s'", message, show));
178  free(show);
179  }
180 }
181 
182 void TreeReader::setExpectedError(const char *expected) {
183  setErrorAt(GBS_global_string("Expected %s", expected));
184 }
185 
186 int TreeReader::get_char() {
187  // reads character from stream
188  // - converts linefeeds for DOS- and MAC-textfiles
189  // - increments line_cnt
190 
191  int c = getc(in);
192  int inc = 0;
193 
194  if (c == '\n') {
195  switch (lfmode) {
196  case LF_UNKNOWN: lfmode = LF_N; inc = 1; break;
197  case LF_N: inc = 1; break;
198  case LF_R: lfmode = LF_RN; c = get_char(); break;
199  case LF_NR: c = get_char(); break;
200  case LF_RN: inc = 1; break;
201  }
202  }
203  else if (c == '\r') {
204  switch (lfmode) {
205  case LF_UNKNOWN: lfmode = LF_R; inc = 1; break;
206  case LF_R: inc = 1; break;
207  case LF_N: lfmode = LF_NR; c = get_char(); break;
208  case LF_RN: c = get_char(); break;
209  case LF_NR: inc = 1; break;
210  }
211  if (c == '\r') c = '\n'; // never report '\r'
212  }
213  if (inc) line_cnt++;
214 
215  return c;
216 }
217 
218 int TreeReader::read_tree_char() {
219  // reads over tree comment(s) and whitespace.
220  // tree comments are stored inside TreeReader
221 
222  bool done = false;
223  int c = ' ';
224 
225  while (!done && !error) {
226  c = get_char();
227  if (c == ' ' || c == '\t' || c == '\n') ; // skip
228  else if (c == '[') { // collect tree comment(s)
229  int openBrackets = 1;
230  if (tree_comment.get_position()) {
231  tree_comment.put('\n'); // not first comment -> add new line
232  }
233 
234  while (openBrackets && !error) {
235  c = get_char();
236  switch (c) {
237  case EOF:
238  setError("Reached end of file while reading comment");
239  break;
240  case ']':
241  openBrackets--;
242  if (openBrackets) tree_comment.put(c); // write all but last closing brackets
243  break;
244  case '[':
245  openBrackets++;
246  // fall-through
247  default:
248  tree_comment.put(c);
249  break;
250  }
251  }
252  }
253  else done = true;
254  }
255 
256  last_character = c;
257  return c;
258 }
259 
260 int TreeReader::read_char() {
261  int c = get_char();
262  last_character = c;
263  return c;
264 }
265 
266 void TreeReader::eat_white() {
267  int c = last_character;
268  while ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) {
269  c = read_char();
270  }
271 }
272 
273 bool TreeReader::eat_number(GBT_LEN& result) {
274  char strng[256];
275  char *s = strng;
276  int c = last_character;
277 
278  while (((c<='9') && (c>='0')) || (c=='.') || (c=='-') || (c=='+') || (c=='e') || (c=='E')) {
279  *(s++) = c;
280  c = read_char();
281  }
282  *s = 0;
283  result = GB_atof(strng);
284  eat_white();
285 
286  bool consumed_some_length = strng[0];
287  return consumed_some_length;
288 }
289 
290 char *TreeReader::eat_quoted_string() {
300  const int MAX_NAME_LEN = 1000;
301 
302  char buffer[MAX_NAME_LEN+2];
303  char *s = buffer;
304  int c = last_character;
305 
306 #define NAME_TOO_LONG ((s-buffer)>MAX_NAME_LEN)
307 
308  if (c == '\'' || c == '"') {
309  char found_quote = c;
310 
311  c = read_char();
312  while (c!=EOF && c!=found_quote) {
313  *(s++) = c;
314  if (NAME_TOO_LONG) { c = 0; break; }
315  c = read_char();
316  }
317  if (c == found_quote) c = read_tree_char();
318  }
319  else {
320 #if 0
321  // previous behavior: skip prefixes matching PRE '_* *'
322  // (reason unknown; behavior exists since [2])
323  // conflicts with replacement of problematic character done in ../TREE_WRITE/TreeWrite.cxx@replace_by_underscore
324  // -> disabled
325  while (c == '_') c = read_tree_char();
326  while (c == ' ') c = read_tree_char();
327 #endif
328  while (c!=':' && c!=EOF && c!=',' && c!=';' && c != ')') {
329  *(s++) = c;
330  if (NAME_TOO_LONG) break;
331  c = read_tree_char();
332  }
333  }
334  *s = 0;
335  if (NAME_TOO_LONG) {
336  setError(GBS_global_string("Name '%s' is longer than %i bytes", buffer, MAX_NAME_LEN));
337  return NULp;
338  }
339  return strdup(buffer);
340 }
341 
342 void TreeReader::setBranchName_acceptingBootstrap(TreeNode *node, char*& name) {
343  // store groupname and/or bootstrap value.
344  //
345  // ARBs extended newick format allows 3 kinds of node-names:
346  // 'groupname'
347  // 'bootstrap'
348  // 'bootstrap:groupname' (needs to be quoted)
349  //
350  // where
351  // 'bootstrap' is sth interpretable as double (optionally followed by '%')
352  // 'groupname' is sth not interpretable as double
353  //
354  // If a groupname is detected, it is stored in node->name
355  // If a bootstrap is detected, it is stored in node->remark_branch
356  //
357  // Bootstrap values will be scaled up by factor 100.
358  // Wrong scale-ups (to 10000) will be corrected by calling TREE_scale() after the whole tree has been loaded.
359 
360  char *new_name = NULp;
361  {
362  double bootstrap;
363  const char *label = name;
364  bool is_bootstrap = parse_treelabel(label, bootstrap);
365 
366  if (is_bootstrap) {
367  bootstrap = bootstrap*100.0; // needed if bootstrap values are between 0.0 and 1.0 (downscaling is done later)
368  if (bootstrap > max_found_bootstrap) { max_found_bootstrap = bootstrap; }
369 
370  if (node->get_remark()) {
371  error = "Invalid duplicated bootstrap specification detected";
372  }
373  else {
374  node->set_bootstrap(bootstrap);
375  }
376 
377  if (label) new_name = strdup(label);
378  freenull(name);
379  }
380  else {
381  reassign(new_name, name); // use whole input as groupname
382  }
383  }
384 
385  if (new_name) {
386  if (node->name) {
387  if (node->is_leaf()) {
388  if (count.dropped_leaf_groups<MAX_DROPPED_GROUP_WARN) {
389  add_warningf("Dropped group name specified for a single-node-subtree ('%s')", new_name);
390  if (++count.dropped_leaf_groups == MAX_DROPPED_GROUP_WARN) {
391  add_warning("[Note: further warnings of this type will be suppressed]");
392  }
393  }
394  freenull(new_name);
395  }
396  else {
397  if (count.dropped_duplicated_groups<MAX_DROPPED_GROUP_WARN) {
398  add_warningf("Duplicated group name specification detected: dropped inner ('%s'), kept outer group name ('%s')",
399  node->name, new_name);
400  if (++count.dropped_duplicated_groups == MAX_DROPPED_GROUP_WARN) {
401  add_warning("[Note: further warnings of this type will be suppressed]");
402  }
403  }
404  freeset(node->name, new_name);
405  }
406  }
407  else {
408  node->name = new_name;
409  }
410  }
411 }
412 
413 void TreeReader::drop_tree_char(char expected) {
414  if (last_character != expected) {
415  setExpectedError(GBS_global_string("'%c'", expected));
416  }
417  read_tree_char();
418 }
419 
420 bool TreeReader::eat_and_set_name_and_length(TreeNode *node, GBT_LEN& nodeLen) {
421  // reads optional branch-length and -name
422  //
423  // if 'nodeLen' contains DEFAULT_BRANCH_LENGTH_MARKER, it gets overwritten with any found length-specification
424  // otherwise found length is added to 'nodeLen'
425  //
426  // sets the branch-name of 'node', if a name is found (e.g. sth like "(...)'name':0.5")
427  //
428  // returns true if successful, false otherwise (TreeReader::error is set then)
429 
430  bool done = false;
431  bool length_consumed = false;
432 
433  while (!done && !error) {
434  switch (last_character) {
435  case ';':
436  case ',':
437  case ')':
438  done = true;
439  break;
440  case ':':
441  if (!error && length_consumed) setErrorAt("Unexpected ':' (already read a branchlength)");
442  if (!error) drop_tree_char(':');
443  if (!error) {
444  GBT_LEN foundlen;
445  if (eat_number(foundlen)) {
446  if (is_marked_as_default_len(nodeLen)) {
447  nodeLen = foundlen;
448  }
449  else {
450  tree_assert(node->is_leaf()); // should only happen when a single leaf in parenthesis was read
451  nodeLen += foundlen; // sum leaf and node lengths
452  }
453  max_found_branchlen = std::max(max_found_branchlen, nodeLen);
454  }
455  else {
456  setExpectedError("valid length");
457  }
458  }
459  length_consumed = true;
460  break;
461 
462  case EOF:
463  done = true;
464  break;
465 
466  default: {
467  char *branchName = eat_quoted_string();
468  if (branchName) {
469  if (branchName[0]) setBranchName_acceptingBootstrap(node, branchName);
470  }
471  else {
472  UNCOVERED();
473  setExpectedError("branch-name or one of ':;,)'");
474  }
475  break;
476  }
477  }
478  }
479 
480  return !error;
481 }
482 
483 static TreeNode *createLinkedTreeNode(const TreeRoot& nodeMaker, TreeNode *left, GBT_LEN leftlen, TreeNode *right, GBT_LEN rightlen) { // @@@ move into class GBT_tree (as ctor) - or better move into TreeNodeFactory
484  TreeNode *node = nodeMaker.makeNode();
485 
486  node->leftson = left;
487  node->leftlen = leftlen;
488  node->rightson = right;
489  node->rightlen = rightlen;
490 
491  left->father = node;
492  right->father = node;
493 
494  return node;
495 }
496 
497 TreeNode *TreeReader::load_named_node(GBT_LEN& nodeLen) {
498  // reads a node or subtree.
499  // a single node is expected to have a name (or will be auto-named)
500  // subtrees may have a name (groupname)
501  TreeNode *node = NULp;
502 
503  if (last_character == '(') {
504  node = load_subtree(nodeLen);
505  }
506  else { // single node
507  eat_white();
508  char *name = eat_quoted_string();
509  if (name) {
510  if (!name[0]) freeset(name, unnamedNodeName());
511 
512  node = troot->makeNode();
513  node->name = name;
514  node->markAsLeaf();
515  }
516  else {
517  UNCOVERED();
518  setExpectedError("(quoted) string");
519  }
520  }
521  if (node && !error) {
522  if (!eat_and_set_name_and_length(node, nodeLen)) {
523  node->forget_origin();
524  destroy(node, troot);
525  node = NULp;
526  }
527  }
528  tree_assert(contradicted(node, error));
529  tree_assert(!node || !node->is_leaf() || node->name); // leafs need to be named here
530  return node;
531 }
532 
533 
534 TreeNode *TreeReader::load_subtree(GBT_LEN& nodeLen) {
535  // loads a subtree (i.e. expects parenthesis around one or several nodes)
536  //
537  // 'nodeLen' normally is set to DEFAULT_BRANCH_LENGTH_MARKER
538  // or to length of single node (if parenthesis contain only one node)
539  //
540  // length and/or name behind '(...)' are not parsed (has to be done by caller).
541  //
542  // if subtree contains a single node (or a single other subtree), 'name'+'remark_branch' are
543  // already set, when load_subtree() returns - otherwise they are NULp.
544 
545  TreeNode *node = NULp;
546 
547  drop_tree_char('(');
548 
550  TreeNode *left = load_named_node(leftLen);
551 
552  if (left) {
553  switch (last_character) {
554  case ')': // single node
555  nodeLen = leftLen;
556  node = left;
557  left = NULp;
558  break;
559 
560  case ',': {
562  TreeNode *right = NULp;
563 
564  while (last_character == ',' && !error) {
565  if (right) { // multi-branch
566  TreeNode *pair = createLinkedTreeNode(*troot, left, leftLen, right, rightLen);
567 
568  left = pair; leftLen = 0;
569  right = NULp; rightLen = DEFAULT_BRANCH_LENGTH_MARKER;
570  }
571 
572  drop_tree_char(',');
573  if (!error) {
574  right = load_named_node(rightLen);
575  }
576  }
577 
578  if (!error) {
579  if (last_character == ')') {
580  node = createLinkedTreeNode(*troot, left, leftLen, right, rightLen);
582 
583  left = NULp;
584  right = NULp;
585  }
586  else {
587  setExpectedError("one of ',)'");
588  }
589  }
590 
591  if (right) {
592  right->forget_origin();
593  destroy(right, troot);
594  }
595  if (error && node) {
596  node->forget_origin();
597  destroy(node, troot);
598  node = NULp;
599  }
600 
601  break;
602  }
603 
604  default:
605  setExpectedError("one of ',)'");
606  break;
607  }
608  if (left) {
609  left->forget_origin();
610  destroy(left, troot);
611  }
612  }
613 
614  if (!error) drop_tree_char(')');
615 
616  tree_assert(contradicted(node, error));
617  return node;
618 }
619 
620 TreeNode *TREE_load(const char *path, TreeRoot *troot, char **commentPtr, bool allow_length_scaling, char **warningPtr) {
621  /* Load a newick compatible tree from file 'path',
622  if commentPtr is specified -> set it to a malloc copy of all concatenated comments found in tree file
623  if warningPtr is specified -> set it to a malloc copy of any warnings occurring during tree-load (e.g. autoscale- or informational warnings)
624  */
625 
626  TreeNode *tree = NULp;
627  FILE *input = fopen(path, "rt");
628  GB_ERROR error = NULp;
629  bool own_root = true;
630 
631  if (!input) {
632  error = GBS_global_string("No such file: %s", path);
633  }
634  else {
635  const char *name_only = strrchr(path, '/');
636  if (name_only) ++name_only;
637  else name_only = path;
638 
639  TreeReader reader(input, name_only, troot);
640  if (!reader.error) {
641  tree = reader.load();
642  if (tree) own_root = false;
643  }
644  fclose(input);
645 
646  if (reader.error) error = reader.error;
647  else if (tree && tree->is_leaf()) error = "tree is too small (need at least 2 species)";
648 
649  if (error) {
650  destroy(tree);
651  tree = NULp;
652  }
653 
654  if (tree) {
655  double bootstrap_scale = 1.0;
656  double branchlen_scale = 1.0;
657 
658  if (reader.get_max_found_bootstrap() >= 101.0) { // bootstrap values were given in percent
659  bootstrap_scale = 0.01;
660  reader.add_warningf("Auto-scaling bootstrap values by factor %.2f (max. found bootstrap was %5.2f)",
661  bootstrap_scale, reader.get_max_found_bootstrap());
662  }
663  if (reader.get_max_found_branchlen() >= 1.1) { // assume branchlengths have range [0;100]
664  if (allow_length_scaling) {
665  branchlen_scale = 0.01;
666  reader.add_warningf("Auto-scaling branchlengths by factor %.2f (max. found branchlength = %.2f)\n"
667  "(use ARB_NT/Tree/Modify branches/Scale branchlengths with factor %.2f to undo auto-scaling)",
668  branchlen_scale, reader.get_max_found_branchlen(), 1.0/branchlen_scale);
669  }
670  }
671 
672  TREE_scale(tree, branchlen_scale, bootstrap_scale); // scale bootstraps and branchlengths
673 
674  if (warningPtr) {
675  const char *wmsg = reader.get_warnings();
676  if (wmsg) *warningPtr = strdup(wmsg);
677  }
678 
679  if (commentPtr) {
680  char *comment = reader.takeComment();
681 
682  const char *loaded_from = GBS_global_string("Loaded from %s", path);
683  freeset(comment, GBS_log_action_to(comment, loaded_from, true));
684 
685  tree_assert(!*commentPtr);
686  *commentPtr = comment;
687  }
688  }
689  }
690 
691  tree_assert(tree||error);
692  if (error) {
693  GB_export_errorf("Import tree: %s", error);
694  tree_assert(!tree);
695  if (own_root) troot->delete_by_node();
696  }
697 
698  return tree;
699 }
700 
701 GB_ERROR TREE_load_to_db(GBDATA *gb_main, const char *treefile, const char *tree_name) {
702  GB_ERROR error = NULp;
703 
704  char *warnings = NULp;
705  char *tree_comment = NULp;
706 
707  TreeNode *tree = TREE_load(treefile, new SimpleRoot, &tree_comment, true, &warnings);
708 
709  if (!tree) error = GB_await_error();
710  else {
711  if (warnings) GBT_message(gb_main, warnings);
712 
713  {
714  GB_transaction ta(gb_main);
715  error = GBT_write_tree_with_remark(gb_main, tree_name, tree, tree_comment);
716  error = ta.close(error);
717  }
718 
719  destroy(tree);
720  }
721 
722  free(warnings);
723  free(tree_comment);
724 
725  return error;
726 }
727 
728 // --------------------------------------------------------------------------------
729 
730 #ifdef UNIT_TESTS
731 #ifndef TEST_UNIT_H
732 #include <test_unit.h>
733 #endif
734 
735 static TreeNode *loadFromFileContaining(const char *treeString, char **warningsPtr) {
736  const char *filename = "trees/tmp.tree";
737  FILE *out = fopen(filename, "wt");
738  TreeNode *tree = NULp;
739 
740  if (out) {
741  fputs(treeString, out);
742  fclose(out);
743  tree = TREE_load(filename, new SimpleRoot, NULp, false, warningsPtr);
744  }
745  else {
746  GB_export_IO_error("save tree", filename);
747  }
748 
749  return tree;
750 }
751 
752 static arb_test::match_expectation loading_tree_failed_with(TreeNode *tree, const char *errpart) {
753  using namespace arb_test;
754  expectation_group expected;
755 
756  expected.add(that(tree).is_equal_to_NULL());
757  expected.add(that(GB_have_error()).is_equal_to(true));
758  if (GB_have_error()) {
759  expected.add(that(GB_await_error()).does_contain(errpart));
760  }
761  return all().ofgroup(expected);
762 }
763 
764 static arb_test::match_expectation loading_tree_succeeds(TreeNode *tree, const char *newick_expected) {
765  using namespace arb_test;
766  expectation_group expected;
767 
768  expected.add(that(tree).does_differ_from_NULL());
769  expected.add(that(GB_get_error()).is_equal_to_NULL());
770  if (!GB_have_error() && tree) {
771  char *newick = GBT_tree_2_newick(tree, nSIMPLE, false);
772  expected.add(that(newick).is_equal_to(newick_expected));
773  free(newick);
774  }
775  return all().ofgroup(expected);
776 }
777 
778 #define TEST_EXPECT_TREELOAD_FAILED_WITH(tree,errpart) TEST_EXPECTATION(loading_tree_failed_with(tree, errpart))
779 #define TEST_EXPECT_TREELOAD_FAILED_WITH__BROKEN(tree,errpart) TEST_EXPECTATION__BROKEN(loading_tree_failed_with(tree, errpart))
780 
781 #define TEST_EXPECT_TREELOAD(tree,newick) TEST_EXPECTATION(loading_tree_succeeds(tree,newick))
782 #define TEST_EXPECT_TREELOAD__BROKEN(tree,newick) TEST_EXPECTATION__BROKEN(loading_tree_succeeds(tree,newick))
783 
784 #define TEST_EXPECT_TREEFILE_FAILS_WITH(name,errpart) do { \
785  TreeNode *tree = TREE_load(name, new SimpleRoot, NULp, false, NULp); \
786  TEST_EXPECT_TREELOAD_FAILED_WITH(tree, errpart); \
787  } while(0)
788 
789 #define TEST_EXPECT_TREESTRING_FAILS_WITH(treeString,errpart) do { \
790  TreeNode *tree = loadFromFileContaining(treeString, NULp); \
791  TEST_EXPECT_TREELOAD_FAILED_WITH(tree, errpart); \
792  } while(0)
793 
794 // argument 'newick' is vs regression only!
795 #define TEST_EXPECT_TREESTRING_FAILS_WITH__BROKEN(treeString,errpart,newick) do { \
796  char *warnings = NULp; \
797  TreeNode *tree = loadFromFileContaining(treeString, &warnings); \
798  TEST_EXPECT_TREELOAD_FAILED_WITH__BROKEN(tree, errpart); \
799  TEST_EXPECT_TREELOAD(tree, newick); \
800  TEST_EXPECT_NULL(warnings); \
801  delete tree; \
802  free(warnings); \
803  } while(0)
804 
805 #define TEST_EXPECT_TREESTRING_OK(treeString,newick) do { \
806  char *warnings = NULp; \
807  TreeNode *tree = loadFromFileContaining(treeString, &warnings); \
808  TEST_EXPECT_TREELOAD(tree, newick); \
809  TEST_EXPECT_NULL(warnings); \
810  destroy(tree); \
811  free(warnings); \
812  } while(0)
813 
814 #define TEST_EXPECT_TREESTRING_OK_WITH_WARNING(treeString,newick,warnPart) do { \
815  char *warnings = NULp; \
816  TreeNode *tree = loadFromFileContaining(treeString, &warnings); \
817  TEST_EXPECT_TREELOAD(tree, newick); \
818  TEST_REJECT_NULL(warnings); \
819  TEST_EXPECT_CONTAINS(warnings, warnPart); \
820  destroy(tree); \
821  free(warnings); \
822  } while(0)
823 
824 #define TEST_EXPECT_TREESTRING_OK__BROKEN(treeString,newick) do { \
825  TreeNode *tree = loadFromFileContaining(treeString, NULp); \
826  TEST_EXPECT_TREELOAD__BROKEN(tree, newick); \
827  } while(0)
828 
829 void TEST_load_tree() {
830  // just are few tests covering most of this module.
831  // more load tests are in ../../TOOLS/arb_test.cxx@TEST_SLOW_arb_read_tree
832 
833  // simple succeeding tree load
834  {
835  char *comment = NULp;
836  TreeNode *tree = TREE_load("trees/test.tree", new SimpleRoot, &comment, false, NULp);
837  // -> ../../UNIT_TESTER/run/trees/test.tree
838 
839  TEST_EXPECT_TREELOAD(tree, "(((s1,s2),(s3,s 4)),(s5,s-6));");
840  if (tree) {
841  TEST_REJECT_NULL(comment);
842  TEST_EXPECT_CONTAINS(comment,
843  // comment part from treefile:
844  "tree covering most of tree reader code\n"
845  "comment contains [extra brackets] inside comment\n");
846  TEST_EXPECT_CONTAINS(comment,
847  // comment as appended by load:
848  ": Loaded from trees/test.tree\n");
849  }
850  free(comment);
851  destroy(tree);
852  }
853 
854  // detailed load tests (checking branchlengths and nodenames)
855  {
856  const char *treestring[] = {
857  "(node1,node2)rootgroup;", // [0] tree with a named root
858  "(node1:0.00,(node2, node3:0.57)):0;", // [1] test tree lengths (esp. length zero)
859  "(((((a))single)), ((b, c)17%:0.2));", // [2] test single-node-subtree name-conflict
860 
861  "((a,b)17,(c,d)33.3,(e,f)12.5:0.2);", // [3] test bootstraps
862  "((a,b)G,(c,d)H,(e,f)I:0.2);", // [4] test groupnames w/o bootstraps
863  "((a,b)'17:G',(c,d)'33.3:H',(e,f)'12.5:I':0.2);", // [5] test groupnames with bootstraps
864  "((a,b)17G,(c,d)33.3H,(e,f)12.5I:0.2)", // [6] test groupnames + bootstraps w/o separator
865 
866  "((a,b)'17%:G',(c,d)'33.3%:H',(e,f)'12.5%:I':0.2);", // [7] test bootstraps with percent spec
867  "((a,b)'0.17:G',(c,d)'0.333:H',(e,f)'0.125:I':0.2);", // [8] test bootstraps in range [0..1]
868  };
869 
870  const char *expected_newick[] = {
871  "(node1,node2);",
872  "(node1,(node2,node3));",
873  "(a,(b,c));",
874 
875  "(((a,b),(c,d)),(e,f));",
876  "(((a,b),(c,d)),(e,f));",
877  "(((a,b),(c,d)),(e,f));",
878  "(((a,b),(c,d)),(e,f));",
879 
880  "(((a,b),(c,d)),(e,f));",
881  "(((a,b),(c,d)),(e,f));",
882  };
883  const char *expected_warnings[] = {
884  NULp,
885  NULp,
886  "Dropped group name specified for a single-node-subtree",
887 
888  "Auto-scaling bootstrap values by factor 0.01",
889  NULp,
890  "Auto-scaling bootstrap values by factor 0.01",
891  NULp,
892  NULp, // no auto-scaling shall occur here (bootstraps are already specified as percent)
893  NULp, // no auto-scaling shall occur here (bootstraps are in [0..1])
894  };
895 
896  STATIC_ASSERT(ARRAY_ELEMS(expected_newick) == ARRAY_ELEMS(treestring));
897  STATIC_ASSERT(ARRAY_ELEMS(expected_warnings) == ARRAY_ELEMS(treestring));
898 
899  for (size_t i = 0; i<ARRAY_ELEMS(treestring); ++i) {
900  TEST_ANNOTATE(GBS_global_string("for tree #%zu = '%s'", i, treestring[i]));
901  char *warnings = NULp;
902  TreeNode *tree = loadFromFileContaining(treestring[i], &warnings);
903  TEST_EXPECT_TREELOAD(tree, expected_newick[i]);
904  switch (i) {
905  case 0:
906  TEST_EXPECT_EQUAL(tree->name, "rootgroup");
907  break;
908  case 1:
909  TEST_EXPECT_EQUAL(tree->leftlen, 0);
911  TEST_EXPECT_EQUAL(tree->rightson->rightlen, 0.57);
912  break;
913  case 2:
914  // test bootstrap with percent-specification is parsed correctly
916  TEST_EXPECT_EQUAL(tree->rightson->get_remark(), "17%");
917  TEST_EXPECT_EQUAL(tree->rightlen, 0.2);
918  break;
919 
920  case 3:
921  case 4:
922  case 5:
923  case 6:
924  case 7:
925  case 8:
926  // check bootstraps
928  switch (i) {
929  case 4:
930  case 6:
934  break;
935  case 3:
936  case 5:
937  case 7:
938  case 8:
939  TEST_EXPECT_EQUAL(tree->leftson->leftson->get_remark(), "17%");
940  TEST_EXPECT_EQUAL(tree->leftson->rightson->get_remark(), "33%");
941  TEST_EXPECT_EQUAL(tree->rightson->get_remark(), "13%");
942  break;
943  default:
944  TEST_REJECT(true); // unhandled tree
945  break;
946  }
947 
948  // check node-names
949  TEST_EXPECT_NULL(tree->name);
950  TEST_EXPECT_NULL(tree->leftson->name);
951  switch (i) {
952  case 6:
953  // check un-separated digits are treated as strange names
954  // (previously these were accepted as bootstraps)
955  TEST_EXPECT_EQUAL(tree->leftson->leftson->name, "17G");
956  TEST_EXPECT_EQUAL(tree->leftson->rightson->name, "33.3H");
957  TEST_EXPECT_EQUAL(tree->rightson->name, "12.5I");
958  break;
959  case 4:
960  case 5:
961  case 8:
962  case 7:
963  TEST_EXPECT_EQUAL(tree->leftson->leftson->name, "G");
964  TEST_EXPECT_EQUAL(tree->leftson->rightson->name, "H");
965  TEST_EXPECT_EQUAL(tree->rightson->name, "I");
966  break;
967  case 3:
971  break;
972  default:
973  TEST_REJECT(true); // unhandled tree
974  break;
975  }
976 
977  // expect_no_lengths:
978  TEST_EXPECT_EQUAL(tree->leftlen, 0); // multifurcation
981  TEST_EXPECT_EQUAL(tree->rightlen, 0.2);
982  break;
983 
984  default:
985  TEST_REJECT(true); // unhandled tree
986  break;
987  }
988  if (expected_warnings[i]) {
989  TEST_REJECT_NULL(warnings);
990  TEST_EXPECT_CONTAINS(warnings, expected_warnings[i]);
991  }
992  else {
993  TEST_EXPECT_NULL(warnings);
994  }
995  free(warnings);
996  destroy(tree);
997  }
998 
999  TEST_ANNOTATE(NULp);
1000  }
1001 
1002  // test valid trees with strange or wrong behavior
1003  TEST_EXPECT_TREESTRING_OK("(,);", "(unnamed1,unnamed2);"); // tree with 2 unamed species (weird, but ok)
1004  TEST_EXPECT_TREESTRING_OK("( a, (b,(c),d), (e,(f)) );", "((a,((b,c),d)),(e,f));");
1005  TEST_EXPECT_TREESTRING_OK("(((((a)))), ((b, c)));", "(a,(b,c));");
1006 
1007  TEST_EXPECT_TREESTRING_OK_WITH_WARNING("( (a), (((b),(c),(d))group)dupgroup, ((e),(f)) );",
1008  "((a,((b,c),d)),(e,f));",
1009  "Duplicated group name specification detected");
1010 
1011  // test unacceptable trees
1012  {
1013  const char *tooSmallTree[] = {
1014  "();",
1015  "()",
1016  ";",
1017  "",
1018  "(one)",
1019  "((((()))));",
1020  "(((((one)))));",
1021  };
1022 
1023  for (size_t i = 0; i<ARRAY_ELEMS(tooSmallTree); ++i) {
1024  TEST_ANNOTATE(GBS_global_string("for tree #%zu = '%s'", i, tooSmallTree[i]));
1025  TreeNode *tree = loadFromFileContaining(tooSmallTree[i], NULp);
1026  TEST_EXPECT_TREELOAD_FAILED_WITH(tree, "tree is too small");
1027  }
1028  TEST_ANNOTATE(NULp);
1029  }
1030  {
1031  TreeNode *tree = loadFromFileContaining("((a, b)25)20;", NULp);
1032  TEST_EXPECT_TREELOAD_FAILED_WITH(tree, "Invalid duplicated bootstrap specification detected");
1033  }
1034 
1035  // test invalid trees
1036  TEST_EXPECT_TREESTRING_FAILS_WITH("(;);", "Expected one of ',)'");
1037 
1038  TEST_EXPECT_TREESTRING_FAILS_WITH("(17", "Expected one of ',)' while end-of-file was reached");
1039  TEST_EXPECT_TREESTRING_FAILS_WITH("((((", "Expected one of ',)' while end-of-file was reached");
1040  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, 'b", "Expected one of ',)' while end-of-file was reached");
1041 
1042  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, b:5::::", "Unexpected ':' (already read a branchlength) while looking at '::::<EOF>'");
1043  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, b:5:c:d", "Unexpected ':' (already read a branchlength) while looking at ':c:d<EOF>'");
1044  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, b:5:c:d)", "Unexpected ':' (already read a branchlength) while looking at ':c:d)<EOF>'");
1045 
1046  TEST_EXPECT_TREESTRING_FAILS_WITH("[unclosed\ncomment", "while reading comment");
1047  TEST_EXPECT_TREESTRING_FAILS_WITH("[unclosed\ncomment [ bla ]", "while reading comment");
1048 
1049  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, b:d)", "Expected valid length while looking at 'd)<EOF>'");
1050 
1051  // questionable accepted trees / check warnings
1052  TEST_EXPECT_TREESTRING_OK_WITH_WARNING("(a,b):0.5", "(a,b);", "Length specified for root-node has been ignored");
1053  TEST_EXPECT_TREESTRING_OK_WITH_WARNING("(a, b))", "(a,b);", "Unexpected input-data after tree: ')'");
1054 
1055  TEST_EXPECT_TREESTRING_OK("(a*,b%);", "(a*,b%);"); // @@@ really accept such names?
1056  TEST_EXPECT_TREESTRING_OK("(a, b:5)", "(a,b);");
1057 
1058  // check errors
1059  TEST_EXPECT_TREEFILE_FAILS_WITH("trees/nosuch.tree", "No such file");
1060  TEST_EXPECT_TREEFILE_FAILS_WITH("trees/corrupted.tree", "Error reading");
1061 
1062  TEST_EXPECT_ZERO_OR_SHOW_ERRNO(GB_unlink("trees/tmp.tree")); // cleanup
1063 }
1064 
1065 #endif // UNIT_TESTS
1066 
1067 // --------------------------------------------------------------------------------
GB_ERROR GB_get_error()
Definition: arb_msg.cxx:333
GB_ERROR get_warnings() const
Definition: TreeRead.cxx:115
void set_bootstrap(double bootstrap)
Definition: TreeNode.h:323
const char * GB_ERROR
Definition: arb_core.h:25
string result
#define MAX_NAME_LEN
group_matcher all()
Definition: test_unit.h:1011
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
GB_ERROR GBT_write_tree_with_remark(GBDATA *gb_main, const char *tree_name, TreeNode *tree, const char *remark)
Definition: adtree.cxx:570
void TREE_scale(TreeNode *tree, double length_scale, double bootstrap_scale)
Definition: TreeTools.cxx:14
char * takeComment()
Definition: TreeRead.cxx:117
char * GBT_tree_2_newick(const TreeNode *tree, NewickFormat format, bool compact)
Definition: adtree.cxx:1412
GB_ERROR error
Definition: TreeRead.cxx:107
void forget_origin()
Definition: TreeNode.h:414
bool is_marked_as_default_len(GBT_LEN len)
Definition: TreeRead.h:20
GB_ERROR TREE_load_to_db(GBDATA *gb_main, const char *treefile, const char *tree_name)
Definition: TreeRead.cxx:701
GB_ERROR GB_export_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:318
#define DEFAULT_BRANCH_LENGTH
Definition: arbdbt.h:18
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
#define FORWARD_FORMATTED(receiver, format)
Definition: arb_msg_fwd.h:19
bool GB_have_error()
Definition: arb_msg.cxx:338
char * release()
Definition: arb_strbuf.h:129
int GB_unlink(const char *path)
Definition: arb_file.cxx:188
#define ARRAY_ELEMS(array)
Definition: arb_defs.h:19
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
GBT_LEN leftlen
Definition: TreeNode.h:172
TreeNode * rightson
Definition: TreeNode.h:171
double get_max_found_bootstrap() const
Definition: TreeRead.cxx:122
__ATTR__FORMAT(2) void add_warningf(const char *format
#define DEFAULT_BRANCH_LENGTH_MARKER
Definition: TreeRead.h:19
#define TEST_EXPECT_CONTAINS(str, part)
Definition: test_unit.h:1316
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
#define tree_assert(cond)
Definition: TreeRead.cxx:20
#define is_equal_to_NULL()
Definition: test_unit.h:1028
virtual TreeNode * makeNode() const =0
bool parse_treelabel(const char *&label, double &bootstrap)
Definition: TreeNode.h:129
TreeReader(FILE *input, const char *file_name, TreeRoot *troot_)
Definition: TreeRead.cxx:126
void message(char *errortext)
#define TEST_REJECT(cond)
Definition: test_unit.h:1330
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1325
TreeNode * father
Definition: TreeNode.h:171
static void error(const char *msg)
Definition: mkptypes.cxx:96
TreeNode * TREE_load(const char *path, TreeRoot *troot, char **commentPtr, bool allow_length_scaling, char **warningPtr)
Definition: TreeRead.cxx:620
expectation_group & add(const expectation &e)
Definition: test_unit.h:812
#define that(thing)
Definition: test_unit.h:1043
#define TEST_EXPECT_ZERO_OR_SHOW_ERRNO(iocond)
Definition: test_unit.h:1090
static SearchTree * tree[SEARCH_PATTERNS]
Definition: ED4_search.cxx:629
TreeNode * load()
Definition: TreeRead.cxx:86
TreeNode * leftson
Definition: TreeNode.h:171
char * GBS_log_action_to(const char *comment, const char *action, bool stamp)
Definition: adstring.cxx:976
#define does_differ_from_NULL()
Definition: test_unit.h:1029
GBT_LEN rightlen
Definition: TreeNode.h:172
#define is_equal_to(val)
Definition: test_unit.h:1025
GBT_LEN get_max_found_branchlen() const
Definition: TreeRead.cxx:123
#define does_contain(val)
Definition: test_unit.h:1040
fputs(TRACE_PREFIX, stderr)
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:262
bool is_leaf() const
Definition: TreeNode.h:211
#define TEST_EXPECT_NULL(n)
Definition: test_unit.h:1322
static list< LineAttachedMessage > warnings
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:35
const char * name_only(const char *fullpath)
Definition: AWTI_import.cxx:46
#define __ATTR__USERESULT
Definition: attributes.h:58
char * name
Definition: TreeNode.h:174
void announce_tree_constructed()
Definition: TreeNode.h:405
static TreeNode * createLinkedTreeNode(const TreeRoot &nodeMaker, TreeNode *left, GBT_LEN leftlen, TreeNode *right, GBT_LEN rightlen)
Definition: TreeRead.cxx:483
void GBT_message(GBDATA *gb_main, const char *msg)
Definition: adtools.cxx:238
float GB_atof(const char *str)
Definition: arbdb.cxx:190
float GBT_LEN
Definition: arbdb_base.h:34
#define NULp
Definition: cxxforward.h:116
void add_warning(const char *msg)
Definition: TreeRead.cxx:109
void markAsLeaf()
Definition: TreeNode.h:212
GB_transaction ta(gb_var)
void destroy(TreeNode *that)
Definition: TreeNode.h:600
GBDATA * gb_main
Definition: adname.cxx:32
const char * get_remark() const
Definition: TreeNode.h:307
void delete_by_node()
Definition: TreeNode.h:94
#define STATIC_ASSERT(const_expression)
Definition: static_assert.h:37
#define NAME_TOO_LONG
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1294
size_t get_position() const
Definition: arb_strbuf.h:112
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
const char * label
void put(char c)
Definition: arb_strbuf.h:174
#define UNCOVERED()
Definition: arb_assert.h:380
#define MAX_DROPPED_GROUP_WARN
Definition: TreeRead.cxx:26
#define max(a, b)
Definition: f2c.h:154
GB_write_int const char s
Definition: AW_awar.cxx:154