ARB
TreeRead.cxx
Go to the documentation of this file.
1 // ============================================================ //
2 // //
3 // File : TreeRead.cxx //
4 // Purpose : load tree from file //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // www.arb-home.de //
8 // //
9 // ============================================================ //
10 
11 #include "TreeRead.h"
12 #include <TreeNode.h>
13 
14 #include <arb_msg_fwd.h>
15 #include <arb_strbuf.h>
16 #include <arb_file.h>
17 #include <arb_defs.h>
18 #include <algorithm>
19 
20 #define tree_assert(cond) arb_assert(cond)
21 
22 /*!******************************************************************************************
23  load a tree from file system
24 ********************************************************************************************/
25 
26 #define MAX_DROPPED_GROUP_WARN 100
27 
28 // --------------------
29 // TreeReader
30 
31 class TreeReader : virtual Noncopyable {
32  enum tr_lfmode { LF_UNKNOWN, LF_N, LF_R, LF_NR, LF_RN, };
33 
34  int unnamed_counter;
35  char *tree_file_name;
36  FILE *in;
37  int last_character; // may be EOF
38  int line_cnt;
39 
40  GBS_strstruct tree_comment;
41  GBT_LEN max_found_branchlen;
42  double max_found_bootstrap;
43  tr_lfmode lfmode;
44 
45  char *warnings;
46 
47  TreeRoot *troot;
48 
49  struct Count {
50  int dropped_leaf_groups;
51  int dropped_duplicated_groups;
52  Count() : dropped_leaf_groups(0), dropped_duplicated_groups(0) {}
53  } count;
54 
55  void setError(const char *message);
56  void setErrorAt(const char *message);
57  void setExpectedError(const char *expected);
58 
59  int get_char();
60  int read_char();
61  int read_tree_char(); // extracts comments and ignores whitespace outside comments
62 
63  char *content_ahead(size_t how_many, bool show_eof);
64 
65  void drop_tree_char(char expected);
66 
67  void setBranchName_acceptingBootstrap(TreeNode *node, char*& name);
68 
69  // The eat-functions below assume that the "current" character
70  // has already been read into 'last_character':
71  void eat_white();
72  __ATTR__USERESULT bool eat_number(GBT_LEN& result);
73  char *eat_quoted_string();
74  bool eat_and_set_name_and_length(TreeNode *node, GBT_LEN& len);
75 
76  char *unnamedNodeName() { return GBS_global_string_copy("unnamed%i", ++unnamed_counter); }
77 
78  TreeNode *load_subtree(GBT_LEN& nodeLen);
79  TreeNode *load_named_node(GBT_LEN& nodeLen);
80 
81 public:
82 
83  TreeReader(FILE *input, const char *file_name, TreeRoot *troot_);
84  ~TreeReader();
85 
87  GBT_LEN rootNodeLen = DEFAULT_BRANCH_LENGTH_MARKER; // ignored dummy
88  TreeNode *tree = load_named_node(rootNodeLen);
89 
90  if (!error) {
91  if (rootNodeLen != DEFAULT_BRANCH_LENGTH_MARKER && rootNodeLen != 0.0) {
92  add_warning("Length specified for root-node has been ignored");
93  }
94 
95  // check for unexpected input
96  if (last_character == ';') read_tree_char(); // accepts ';'
97  if (last_character != EOF) {
98  char *unused_input = content_ahead(30, false);
99  add_warningf("Unexpected input-data after tree: '%s'", unused_input);
100  free(unused_input);
101  }
103  }
104  return tree;
105  }
106 
108 
109  void add_warning(const char *msg) {
110  if (warnings) freeset(warnings, GBS_global_string_copy("%s\n%s", warnings, msg));
111  else warnings = GBS_global_string_copy("Warning(s): %s", msg);
112  }
113  __ATTR__FORMAT(2) void add_warningf(const char *format, ...) { FORWARD_FORMATTED(add_warning, format); }
114 
115  GB_ERROR get_warnings() const { return warnings; } // valid until TreeReader is destroyed
116 
117  char *takeComment() {
118  // can only be called once (further calls will return NULp)
119  return tree_comment.release();
120  }
121 
122  double get_max_found_bootstrap() const { return max_found_bootstrap; }
123  GBT_LEN get_max_found_branchlen() const { return max_found_branchlen; }
124 };
125 
126 TreeReader::TreeReader(FILE *input, const char *file_name, TreeRoot *troot_)
127  : unnamed_counter(0),
128  tree_file_name(strdup(file_name)),
129  in(input),
130  last_character(0),
131  line_cnt(1),
132  tree_comment(2048),
133  max_found_branchlen(-1),
134  max_found_bootstrap(-1),
135  lfmode(LF_UNKNOWN),
136  warnings(NULp),
137  troot(troot_),
138  error(NULp)
139 {
140  read_tree_char();
141 }
142 
144  free(warnings);
145  free(tree_file_name);
146 }
147 
148 void TreeReader::setError(const char *message) {
149  tree_assert(!error);
150  error = GBS_global_string("Error reading %s:%i: %s",
151  tree_file_name, line_cnt, message);
152 }
153 char *TreeReader::content_ahead(size_t how_many, bool show_eof) {
154  char show[how_many+1+4]; // 4 = oversize of '<EOF>'
155  size_t i;
156  for (i = 0; i<how_many; ++i) {
157  show[i] = last_character;
158  if (show[i] == EOF) {
159  if (show_eof) {
160  strcpy(show+i, "<EOF>");
161  i += 5;
162  }
163  break;
164  }
165  read_char();
166  }
167  show[i] = 0;
168  return strdup(show);
169 }
170 
171 void TreeReader::setErrorAt(const char *message) {
172  if (last_character == EOF) {
173  setError(GBS_global_string("%s while end-of-file was reached", message));
174  }
175  else {
176  char *show = content_ahead(30, true);
177  setError(GBS_global_string("%s while looking at '%s'", message, show));
178  free(show);
179  }
180 }
181 
182 void TreeReader::setExpectedError(const char *expected) {
183  setErrorAt(GBS_global_string("Expected %s", expected));
184 }
185 
186 int TreeReader::get_char() {
187  // reads character from stream
188  // - converts linefeeds for DOS- and MAC-textfiles
189  // - increments line_cnt
190 
191  int c = getc(in);
192  int inc = 0;
193 
194  if (c == '\n') {
195  switch (lfmode) {
196  case LF_UNKNOWN: lfmode = LF_N; inc = 1; break;
197  case LF_N: inc = 1; break;
198  case LF_R: lfmode = LF_RN; c = get_char(); break;
199  case LF_NR: c = get_char(); break;
200  case LF_RN: inc = 1; break;
201  }
202  }
203  else if (c == '\r') {
204  switch (lfmode) {
205  case LF_UNKNOWN: lfmode = LF_R; inc = 1; break;
206  case LF_R: inc = 1; break;
207  case LF_N: lfmode = LF_NR; c = get_char(); break;
208  case LF_RN: c = get_char(); break;
209  case LF_NR: inc = 1; break;
210  }
211  if (c == '\r') c = '\n'; // never report '\r'
212  }
213  if (inc) line_cnt++;
214 
215  return c;
216 }
217 
218 int TreeReader::read_tree_char() {
219  // reads over tree comment(s) and whitespace.
220  // tree comments are stored inside TreeReader
221 
222  bool done = false;
223  int c = ' ';
224 
225  while (!done && !error) {
226  c = get_char();
227  if (c == ' ' || c == '\t' || c == '\n') ; // skip
228  else if (c == '[') { // collect tree comment(s)
229  int openBrackets = 1;
230  if (tree_comment.get_position()) {
231  tree_comment.put('\n'); // not first comment -> add new line
232  }
233 
234  while (openBrackets && !error) {
235  c = get_char();
236  switch (c) {
237  case EOF:
238  setError("Reached end of file while reading comment");
239  break;
240  case ']':
241  openBrackets--;
242  if (openBrackets) tree_comment.put(c); // write all but last closing brackets
243  break;
244  case '[':
245  openBrackets++;
246  // fall-through
247  default:
248  tree_comment.put(c);
249  break;
250  }
251  }
252  }
253  else done = true;
254  }
255 
256  last_character = c;
257  return c;
258 }
259 
260 int TreeReader::read_char() {
261  int c = get_char();
262  last_character = c;
263  return c;
264 }
265 
266 void TreeReader::eat_white() {
267  int c = last_character;
268  while ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')) {
269  c = read_char();
270  }
271 }
272 
273 bool TreeReader::eat_number(GBT_LEN& result) {
274  char strng[256];
275  char *s = strng;
276  int c = last_character;
277 
278  while (((c<='9') && (c>='0')) || (c=='.') || (c=='-') || (c=='+') || (c=='e') || (c=='E')) {
279  *(s++) = c;
280  c = read_char();
281  }
282  *s = 0;
283  result = GB_atof(strng);
284  eat_white();
285 
286  bool consumed_some_length = strng[0];
287  return consumed_some_length;
288 }
289 
290 char *TreeReader::eat_quoted_string() {
300  const int MAX_NAME_LEN = 1000;
301 
302  char buffer[MAX_NAME_LEN+2];
303  char *s = buffer;
304  int c = last_character;
305 
306 #define NAME_TOO_LONG ((s-buffer)>MAX_NAME_LEN)
307 
308  if (c == '\'' || c == '"') {
309  char found_quote = c;
310 
311  c = read_char();
312  while (c!=EOF && c!=found_quote) {
313  *(s++) = c;
314  if (NAME_TOO_LONG) { c = 0; break; }
315  c = read_char();
316  }
317  if (c == found_quote) c = read_tree_char();
318  }
319  else {
320 #if 0
321  // previous behavior: skip prefixes matching PRE '_* *'
322  // (reason unknown; behavior exists since [2])
323  // conflicts with replacement of problematic character done in ../TREE_WRITE/TreeWrite.cxx@replace_by_underscore
324  // -> disabled
325  while (c == '_') c = read_tree_char();
326  while (c == ' ') c = read_tree_char();
327 #endif
328  while (c!=':' && c!=EOF && c!=',' && c!=';' && c != ')') {
329  *(s++) = c;
330  if (NAME_TOO_LONG) break;
331  c = read_tree_char();
332  }
333  }
334  *s = 0;
335  if (NAME_TOO_LONG) {
336  setError(GBS_global_string("Name '%s' is longer than %i bytes", buffer, MAX_NAME_LEN));
337  return NULp;
338  }
339  return strdup(buffer);
340 }
341 
342 void TreeReader::setBranchName_acceptingBootstrap(TreeNode *node, char*& name) {
343  // store groupname and/or bootstrap value.
344  //
345  // ARBs extended newick format allows 3 kinds of node-names:
346  // 'groupname'
347  // 'bootstrap'
348  // 'bootstrap:groupname' (needs to be quoted)
349  //
350  // where
351  // 'bootstrap' is sth interpretable as double (optionally followed by '%')
352  // 'groupname' is sth not interpretable as double
353  //
354  // If a groupname is detected, it is stored in node->name
355  // If a bootstrap is detected, it is stored in node->remark_branch
356  //
357  // Bootstrap values will be scaled up by factor 100.
358  // Wrong scale-ups (to 10000) will be corrected by calling TREE_scale() after the whole tree has been loaded.
359 
360  char *new_name = NULp;
361  {
362  char *end = NULp;
363  double bootstrap = strtod(name, &end);
364 
365  bool is_bootstrap = (end != name);
366  if (is_bootstrap) {
367  if (end[0] == '%') {
368  ++end;
369  bootstrap = bootstrap/100.0; // percent -> [0..1]
370  }
371  is_bootstrap = end[0] == ':' || !end[0]; // followed by ':' or at EOS
372  }
373 
374  if (is_bootstrap) {
375  bootstrap = bootstrap*100.0; // needed if bootstrap values are between 0.0 and 1.0 (downscaling is done later)
376  if (bootstrap > max_found_bootstrap) { max_found_bootstrap = bootstrap; }
377 
378  if (node->get_remark()) {
379  error = "Invalid duplicated bootstrap specification detected";
380  }
381  else {
382  node->set_bootstrap(bootstrap);
383  }
384 
385  if (end[0] != 0) { // sth behind bootstrap value
386  arb_assert(end[0] == ':');
387  new_name = strdup(end+1);
388  }
389  free(name);
390  }
391  else {
392  new_name = name; // use whole input as groupname
393  }
394  name = NULp;
395  }
396  if (new_name) {
397  if (node->name) {
398  if (node->is_leaf()) {
399  if (count.dropped_leaf_groups<MAX_DROPPED_GROUP_WARN) {
400  add_warningf("Dropped group name specified for a single-node-subtree ('%s')", new_name);
401  if (++count.dropped_leaf_groups == MAX_DROPPED_GROUP_WARN) {
402  add_warning("[Note: further warnings of this type will be suppressed]");
403  }
404  }
405  freenull(new_name);
406  }
407  else {
408  if (count.dropped_duplicated_groups<MAX_DROPPED_GROUP_WARN) {
409  add_warningf("Duplicated group name specification detected: dropped inner ('%s'), kept outer group name ('%s')",
410  node->name, new_name);
411  if (++count.dropped_duplicated_groups == MAX_DROPPED_GROUP_WARN) {
412  add_warning("[Note: further warnings of this type will be suppressed]");
413  }
414  }
415  freeset(node->name, new_name);
416  }
417  }
418  else {
419  node->name = new_name;
420  }
421  }
422 }
423 
424 void TreeReader::drop_tree_char(char expected) {
425  if (last_character != expected) {
426  setExpectedError(GBS_global_string("'%c'", expected));
427  }
428  read_tree_char();
429 }
430 
431 bool TreeReader::eat_and_set_name_and_length(TreeNode *node, GBT_LEN& nodeLen) {
432  // reads optional branch-length and -name
433  //
434  // if 'nodeLen' contains DEFAULT_BRANCH_LENGTH_MARKER, it gets overwritten with any found length-specification
435  // otherwise found length is added to 'nodeLen'
436  //
437  // sets the branch-name of 'node', if a name is found (e.g. sth like "(...)'name':0.5")
438  //
439  // returns true if successful, false otherwise (TreeReader::error is set then)
440 
441  bool done = false;
442  bool length_consumed = false;
443 
444  while (!done && !error) {
445  switch (last_character) {
446  case ';':
447  case ',':
448  case ')':
449  done = true;
450  break;
451  case ':':
452  if (!error && length_consumed) setErrorAt("Unexpected ':' (already read a branchlength)");
453  if (!error) drop_tree_char(':');
454  if (!error) {
455  GBT_LEN foundlen;
456  if (eat_number(foundlen)) {
457  if (is_marked_as_default_len(nodeLen)) {
458  nodeLen = foundlen;
459  }
460  else {
461  tree_assert(node->is_leaf()); // should only happen when a single leaf in parenthesis was read
462  nodeLen += foundlen; // sum leaf and node lengths
463  }
464  max_found_branchlen = std::max(max_found_branchlen, nodeLen);
465  }
466  else {
467  setExpectedError("valid length");
468  }
469  }
470  length_consumed = true;
471  break;
472 
473  case EOF:
474  done = true;
475  break;
476 
477  default: {
478  char *branchName = eat_quoted_string();
479  if (branchName) {
480  if (branchName[0]) setBranchName_acceptingBootstrap(node, branchName);
481  }
482  else {
483  UNCOVERED();
484  setExpectedError("branch-name or one of ':;,)'");
485  }
486  break;
487  }
488  }
489  }
490 
491  return !error;
492 }
493 
494 static TreeNode *createLinkedTreeNode(const TreeRoot& nodeMaker, TreeNode *left, GBT_LEN leftlen, TreeNode *right, GBT_LEN rightlen) { // @@@ move into class GBT_tree (as ctor) - or better move into TreeNodeFactory
495  TreeNode *node = nodeMaker.makeNode();
496 
497  node->leftson = left;
498  node->leftlen = leftlen;
499  node->rightson = right;
500  node->rightlen = rightlen;
501 
502  left->father = node;
503  right->father = node;
504 
505  return node;
506 }
507 
508 TreeNode *TreeReader::load_named_node(GBT_LEN& nodeLen) {
509  // reads a node or subtree.
510  // a single node is expected to have a name (or will be auto-named)
511  // subtrees may have a name (groupname)
512  TreeNode *node = NULp;
513 
514  if (last_character == '(') {
515  node = load_subtree(nodeLen);
516  }
517  else { // single node
518  eat_white();
519  char *name = eat_quoted_string();
520  if (name) {
521  if (!name[0]) freeset(name, unnamedNodeName());
522 
523  node = troot->makeNode();
524  node->name = name;
525  node->markAsLeaf();
526  }
527  else {
528  UNCOVERED();
529  setExpectedError("(quoted) string");
530  }
531  }
532  if (node && !error) {
533  if (!eat_and_set_name_and_length(node, nodeLen)) {
534  node->forget_origin();
535  destroy(node, troot);
536  node = NULp;
537  }
538  }
539  tree_assert(contradicted(node, error));
540  tree_assert(!node || !node->is_leaf() || node->name); // leafs need to be named here
541  return node;
542 }
543 
544 
545 TreeNode *TreeReader::load_subtree(GBT_LEN& nodeLen) {
546  // loads a subtree (i.e. expects parenthesis around one or several nodes)
547  //
548  // 'nodeLen' normally is set to DEFAULT_BRANCH_LENGTH_MARKER
549  // or to length of single node (if parenthesis contain only one node)
550  //
551  // length and/or name behind '(...)' are not parsed (has to be done by caller).
552  //
553  // if subtree contains a single node (or a single other subtree), 'name'+'remark_branch' are
554  // already set, when load_subtree() returns - otherwise they are NULp.
555 
556  TreeNode *node = NULp;
557 
558  drop_tree_char('(');
559 
561  TreeNode *left = load_named_node(leftLen);
562 
563  if (left) {
564  switch (last_character) {
565  case ')': // single node
566  nodeLen = leftLen;
567  node = left;
568  left = NULp;
569  break;
570 
571  case ',': {
573  TreeNode *right = NULp;
574 
575  while (last_character == ',' && !error) {
576  if (right) { // multi-branch
577  TreeNode *pair = createLinkedTreeNode(*troot, left, leftLen, right, rightLen);
578 
579  left = pair; leftLen = 0;
580  right = NULp; rightLen = DEFAULT_BRANCH_LENGTH_MARKER;
581  }
582 
583  drop_tree_char(',');
584  if (!error) {
585  right = load_named_node(rightLen);
586  }
587  }
588 
589  if (!error) {
590  if (last_character == ')') {
591  node = createLinkedTreeNode(*troot, left, leftLen, right, rightLen);
593 
594  left = NULp;
595  right = NULp;
596  }
597  else {
598  setExpectedError("one of ',)'");
599  }
600  }
601 
602  if (right) {
603  right->forget_origin();
604  destroy(right, troot);
605  }
606  if (error && node) {
607  node->forget_origin();
608  destroy(node, troot);
609  node = NULp;
610  }
611 
612  break;
613  }
614 
615  default:
616  setExpectedError("one of ',)'");
617  break;
618  }
619  if (left) {
620  left->forget_origin();
621  destroy(left, troot);
622  }
623  }
624 
625  if (!error) drop_tree_char(')');
626 
627  tree_assert(contradicted(node, error));
628  return node;
629 }
630 
631 TreeNode *TREE_load(const char *path, TreeRoot *troot, char **commentPtr, bool allow_length_scaling, char **warningPtr) {
632  /* Load a newick compatible tree from file 'path',
633  if commentPtr is specified -> set it to a malloc copy of all concatenated comments found in tree file
634  if warningPtr is specified -> set it to a malloc copy of any warnings occurring during tree-load (e.g. autoscale- or informational warnings)
635  */
636 
637  TreeNode *tree = NULp;
638  FILE *input = fopen(path, "rt");
639  GB_ERROR error = NULp;
640  bool own_root = true;
641 
642  if (!input) {
643  error = GBS_global_string("No such file: %s", path);
644  }
645  else {
646  const char *name_only = strrchr(path, '/');
647  if (name_only) ++name_only;
648  else name_only = path;
649 
650  TreeReader reader(input, name_only, troot);
651  if (!reader.error) {
652  tree = reader.load();
653  if (tree) own_root = false;
654  }
655  fclose(input);
656 
657  if (reader.error) error = reader.error;
658  else if (tree && tree->is_leaf()) error = "tree is too small (need at least 2 species)";
659 
660  if (error) {
661  destroy(tree);
662  tree = NULp;
663  }
664 
665  if (tree) {
666  double bootstrap_scale = 1.0;
667  double branchlen_scale = 1.0;
668 
669  if (reader.get_max_found_bootstrap() >= 101.0) { // bootstrap values were given in percent
670  bootstrap_scale = 0.01;
671  reader.add_warningf("Auto-scaling bootstrap values by factor %.2f (max. found bootstrap was %5.2f)",
672  bootstrap_scale, reader.get_max_found_bootstrap());
673  }
674  if (reader.get_max_found_branchlen() >= 1.1) { // assume branchlengths have range [0;100]
675  if (allow_length_scaling) {
676  branchlen_scale = 0.01;
677  reader.add_warningf("Auto-scaling branchlengths by factor %.2f (max. found branchlength = %.2f)\n"
678  "(use ARB_NT/Tree/Modify branches/Scale branchlengths with factor %.2f to undo auto-scaling)",
679  branchlen_scale, reader.get_max_found_branchlen(), 1.0/branchlen_scale);
680  }
681  }
682 
683  TREE_scale(tree, branchlen_scale, bootstrap_scale); // scale bootstraps and branchlengths
684 
685  if (warningPtr) {
686  const char *wmsg = reader.get_warnings();
687  if (wmsg) *warningPtr = strdup(wmsg);
688  }
689 
690  if (commentPtr) {
691  char *comment = reader.takeComment();
692 
693  const char *loaded_from = GBS_global_string("Loaded from %s", path);
694  freeset(comment, GBS_log_action_to(comment, loaded_from, true));
695 
696  tree_assert(!*commentPtr);
697  *commentPtr = comment;
698  }
699  }
700  }
701 
702  tree_assert(tree||error);
703  if (error) {
704  GB_export_errorf("Import tree: %s", error);
705  tree_assert(!tree);
706  if (own_root) troot->delete_by_node();
707  }
708 
709  return tree;
710 }
711 
712 GB_ERROR TREE_load_to_db(GBDATA *gb_main, const char *treefile, const char *tree_name) {
713  GB_ERROR error = NULp;
714 
715  char *warnings = NULp;
716  char *tree_comment = NULp;
717 
718  TreeNode *tree = TREE_load(treefile, new SimpleRoot, &tree_comment, true, &warnings);
719 
720  if (!tree) error = GB_await_error();
721  else {
722  if (warnings) GBT_message(gb_main, warnings);
723 
724  {
725  GB_transaction ta(gb_main);
726  error = GBT_write_tree_with_remark(gb_main, tree_name, tree, tree_comment);
727  error = ta.close(error);
728  }
729 
730  destroy(tree);
731  }
732 
733  free(warnings);
734  free(tree_comment);
735 
736  return error;
737 }
738 
739 // --------------------------------------------------------------------------------
740 
741 #ifdef UNIT_TESTS
742 #ifndef TEST_UNIT_H
743 #include <test_unit.h>
744 #endif
745 
746 static TreeNode *loadFromFileContaining(const char *treeString, char **warningsPtr) {
747  const char *filename = "trees/tmp.tree";
748  FILE *out = fopen(filename, "wt");
749  TreeNode *tree = NULp;
750 
751  if (out) {
752  fputs(treeString, out);
753  fclose(out);
754  tree = TREE_load(filename, new SimpleRoot, NULp, false, warningsPtr);
755  }
756  else {
757  GB_export_IO_error("save tree", filename);
758  }
759 
760  return tree;
761 }
762 
763 static arb_test::match_expectation loading_tree_failed_with(TreeNode *tree, const char *errpart) {
764  using namespace arb_test;
765  expectation_group expected;
766 
767  expected.add(that(tree).is_equal_to_NULL());
768  expected.add(that(GB_have_error()).is_equal_to(true));
769  if (GB_have_error()) {
770  expected.add(that(GB_await_error()).does_contain(errpart));
771  }
772  return all().ofgroup(expected);
773 }
774 
775 static arb_test::match_expectation loading_tree_succeeds(TreeNode *tree, const char *newick_expected) {
776  using namespace arb_test;
777  expectation_group expected;
778 
779  expected.add(that(tree).does_differ_from_NULL());
780  expected.add(that(GB_get_error()).is_equal_to_NULL());
781  if (!GB_have_error() && tree) {
782  char *newick = GBT_tree_2_newick(tree, nSIMPLE, false);
783  expected.add(that(newick).is_equal_to(newick_expected));
784  free(newick);
785  }
786  return all().ofgroup(expected);
787 }
788 
789 #define TEST_EXPECT_TREELOAD_FAILED_WITH(tree,errpart) TEST_EXPECTATION(loading_tree_failed_with(tree, errpart))
790 #define TEST_EXPECT_TREELOAD_FAILED_WITH__BROKEN(tree,errpart) TEST_EXPECTATION__BROKEN(loading_tree_failed_with(tree, errpart))
791 
792 #define TEST_EXPECT_TREELOAD(tree,newick) TEST_EXPECTATION(loading_tree_succeeds(tree,newick))
793 #define TEST_EXPECT_TREELOAD__BROKEN(tree,newick) TEST_EXPECTATION__BROKEN(loading_tree_succeeds(tree,newick))
794 
795 #define TEST_EXPECT_TREEFILE_FAILS_WITH(name,errpart) do { \
796  TreeNode *tree = TREE_load(name, new SimpleRoot, NULp, false, NULp); \
797  TEST_EXPECT_TREELOAD_FAILED_WITH(tree, errpart); \
798  } while(0)
799 
800 #define TEST_EXPECT_TREESTRING_FAILS_WITH(treeString,errpart) do { \
801  TreeNode *tree = loadFromFileContaining(treeString, NULp); \
802  TEST_EXPECT_TREELOAD_FAILED_WITH(tree, errpart); \
803  } while(0)
804 
805 // argument 'newick' is vs regression only!
806 #define TEST_EXPECT_TREESTRING_FAILS_WITH__BROKEN(treeString,errpart,newick) do { \
807  char *warnings = NULp; \
808  TreeNode *tree = loadFromFileContaining(treeString, &warnings); \
809  TEST_EXPECT_TREELOAD_FAILED_WITH__BROKEN(tree, errpart); \
810  TEST_EXPECT_TREELOAD(tree, newick); \
811  TEST_EXPECT_NULL(warnings); \
812  delete tree; \
813  free(warnings); \
814  } while(0)
815 
816 #define TEST_EXPECT_TREESTRING_OK(treeString,newick) do { \
817  char *warnings = NULp; \
818  TreeNode *tree = loadFromFileContaining(treeString, &warnings); \
819  TEST_EXPECT_TREELOAD(tree, newick); \
820  TEST_EXPECT_NULL(warnings); \
821  destroy(tree); \
822  free(warnings); \
823  } while(0)
824 
825 #define TEST_EXPECT_TREESTRING_OK_WITH_WARNING(treeString,newick,warnPart) do { \
826  char *warnings = NULp; \
827  TreeNode *tree = loadFromFileContaining(treeString, &warnings); \
828  TEST_EXPECT_TREELOAD(tree, newick); \
829  TEST_REJECT_NULL(warnings); \
830  TEST_EXPECT_CONTAINS(warnings, warnPart); \
831  destroy(tree); \
832  free(warnings); \
833  } while(0)
834 
835 #define TEST_EXPECT_TREESTRING_OK__BROKEN(treeString,newick) do { \
836  TreeNode *tree = loadFromFileContaining(treeString, NULp); \
837  TEST_EXPECT_TREELOAD__BROKEN(tree, newick); \
838  } while(0)
839 
840 void TEST_load_tree() {
841  // just are few tests covering most of this module.
842  // more load tests are in ../../TOOLS/arb_test.cxx@TEST_SLOW_arb_read_tree
843 
844  // simple succeeding tree load
845  {
846  char *comment = NULp;
847  TreeNode *tree = TREE_load("trees/test.tree", new SimpleRoot, &comment, false, NULp);
848  // -> ../../UNIT_TESTER/run/trees/test.tree
849 
850  TEST_EXPECT_TREELOAD(tree, "(((s1,s2),(s3,s 4)),(s5,s-6));");
851  if (tree) {
852  TEST_REJECT_NULL(comment);
853  TEST_EXPECT_CONTAINS(comment,
854  // comment part from treefile:
855  "tree covering most of tree reader code\n"
856  "comment contains [extra brackets] inside comment\n");
857  TEST_EXPECT_CONTAINS(comment,
858  // comment as appended by load:
859  ": Loaded from trees/test.tree\n");
860  }
861  free(comment);
862  destroy(tree);
863  }
864 
865  // detailed load tests (checking branchlengths and nodenames)
866  {
867  const char *treestring[] = {
868  "(node1,node2)rootgroup;", // [0] tree with a named root
869  "(node1:0.00,(node2, node3:0.57)):0;", // [1] test tree lengths (esp. length zero)
870  "(((((a))single)), ((b, c)17%:0.2));", // [2] test single-node-subtree name-conflict
871 
872  "((a,b)17,(c,d)33.3,(e,f)12.5:0.2);", // [3] test bootstraps
873  "((a,b)G,(c,d)H,(e,f)I:0.2);", // [4] test groupnames w/o bootstraps
874  "((a,b)'17:G',(c,d)'33.3:H',(e,f)'12.5:I':0.2);", // [5] test groupnames with bootstraps
875  "((a,b)17G,(c,d)33.3H,(e,f)12.5I:0.2)", // [6] test groupnames + bootstraps w/o separator
876 
877  "((a,b)'17%:G',(c,d)'33.3%:H',(e,f)'12.5%:I':0.2);", // [7] test bootstraps with percent spec
878  "((a,b)'0.17:G',(c,d)'0.333:H',(e,f)'0.125:I':0.2);", // [8] test bootstraps in range [0..1]
879  };
880 
881  const char *expected_newick[] = {
882  "(node1,node2);",
883  "(node1,(node2,node3));",
884  "(a,(b,c));",
885 
886  "(((a,b),(c,d)),(e,f));",
887  "(((a,b),(c,d)),(e,f));",
888  "(((a,b),(c,d)),(e,f));",
889  "(((a,b),(c,d)),(e,f));",
890 
891  "(((a,b),(c,d)),(e,f));",
892  "(((a,b),(c,d)),(e,f));",
893  };
894  const char *expected_warnings[] = {
895  NULp,
896  NULp,
897  "Dropped group name specified for a single-node-subtree",
898 
899  "Auto-scaling bootstrap values by factor 0.01",
900  NULp,
901  "Auto-scaling bootstrap values by factor 0.01",
902  NULp,
903  NULp, // no auto-scaling shall occur here (bootstraps are already specified as percent)
904  NULp, // no auto-scaling shall occur here (bootstraps are in [0..1])
905  };
906 
907  STATIC_ASSERT(ARRAY_ELEMS(expected_newick) == ARRAY_ELEMS(treestring));
908  STATIC_ASSERT(ARRAY_ELEMS(expected_warnings) == ARRAY_ELEMS(treestring));
909 
910  for (size_t i = 0; i<ARRAY_ELEMS(treestring); ++i) {
911  TEST_ANNOTATE(GBS_global_string("for tree #%zu = '%s'", i, treestring[i]));
912  char *warnings = NULp;
913  TreeNode *tree = loadFromFileContaining(treestring[i], &warnings);
914  TEST_EXPECT_TREELOAD(tree, expected_newick[i]);
915  switch (i) {
916  case 0:
917  TEST_EXPECT_EQUAL(tree->name, "rootgroup");
918  break;
919  case 1:
920  TEST_EXPECT_EQUAL(tree->leftlen, 0);
922  TEST_EXPECT_EQUAL(tree->rightson->rightlen, 0.57);
923  break;
924  case 2:
925  // test bootstrap with percent-specification is parsed correctly
927  TEST_EXPECT_EQUAL(tree->rightson->get_remark(), "17%");
928  TEST_EXPECT_EQUAL(tree->rightlen, 0.2);
929  break;
930 
931  case 3:
932  case 4:
933  case 5:
934  case 6:
935  case 7:
936  case 8:
937  // check bootstraps
939  switch (i) {
940  case 4:
941  case 6:
945  break;
946  case 3:
947  case 5:
948  case 7:
949  case 8:
950  TEST_EXPECT_EQUAL(tree->leftson->leftson->get_remark(), "17%");
951  TEST_EXPECT_EQUAL(tree->leftson->rightson->get_remark(), "33%");
952  TEST_EXPECT_EQUAL(tree->rightson->get_remark(), "13%");
953  break;
954  default:
955  TEST_REJECT(true); // unhandled tree
956  break;
957  }
958 
959  // check node-names
960  TEST_EXPECT_NULL(tree->name);
961  TEST_EXPECT_NULL(tree->leftson->name);
962  switch (i) {
963  case 6:
964  // check un-separated digits are treated as strange names
965  // (previously these were accepted as bootstraps)
966  TEST_EXPECT_EQUAL(tree->leftson->leftson->name, "17G");
967  TEST_EXPECT_EQUAL(tree->leftson->rightson->name, "33.3H");
968  TEST_EXPECT_EQUAL(tree->rightson->name, "12.5I");
969  break;
970  case 4:
971  case 5:
972  case 8:
973  case 7:
974  TEST_EXPECT_EQUAL(tree->leftson->leftson->name, "G");
975  TEST_EXPECT_EQUAL(tree->leftson->rightson->name, "H");
976  TEST_EXPECT_EQUAL(tree->rightson->name, "I");
977  break;
978  case 3:
982  break;
983  default:
984  TEST_REJECT(true); // unhandled tree
985  break;
986  }
987 
988  // expect_no_lengths:
989  TEST_EXPECT_EQUAL(tree->leftlen, 0); // multifurcation
992  TEST_EXPECT_EQUAL(tree->rightlen, 0.2);
993  break;
994 
995  default:
996  TEST_REJECT(true); // unhandled tree
997  break;
998  }
999  if (expected_warnings[i]) {
1000  TEST_REJECT_NULL(warnings);
1001  TEST_EXPECT_CONTAINS(warnings, expected_warnings[i]);
1002  }
1003  else {
1004  TEST_EXPECT_NULL(warnings);
1005  }
1006  free(warnings);
1007  destroy(tree);
1008  }
1009 
1010  TEST_ANNOTATE(NULp);
1011  }
1012 
1013  // test valid trees with strange or wrong behavior
1014  TEST_EXPECT_TREESTRING_OK("(,);", "(unnamed1,unnamed2);"); // tree with 2 unamed species (weird, but ok)
1015  TEST_EXPECT_TREESTRING_OK("( a, (b,(c),d), (e,(f)) );", "((a,((b,c),d)),(e,f));");
1016  TEST_EXPECT_TREESTRING_OK("(((((a)))), ((b, c)));", "(a,(b,c));");
1017 
1018  TEST_EXPECT_TREESTRING_OK_WITH_WARNING("( (a), (((b),(c),(d))group)dupgroup, ((e),(f)) );",
1019  "((a,((b,c),d)),(e,f));",
1020  "Duplicated group name specification detected");
1021 
1022  // test unacceptable trees
1023  {
1024  const char *tooSmallTree[] = {
1025  "();",
1026  "()",
1027  ";",
1028  "",
1029  "(one)",
1030  "((((()))));",
1031  "(((((one)))));",
1032  };
1033 
1034  for (size_t i = 0; i<ARRAY_ELEMS(tooSmallTree); ++i) {
1035  TEST_ANNOTATE(GBS_global_string("for tree #%zu = '%s'", i, tooSmallTree[i]));
1036  TreeNode *tree = loadFromFileContaining(tooSmallTree[i], NULp);
1037  TEST_EXPECT_TREELOAD_FAILED_WITH(tree, "tree is too small");
1038  }
1039  TEST_ANNOTATE(NULp);
1040  }
1041  {
1042  TreeNode *tree = loadFromFileContaining("((a, b)25)20;", NULp);
1043  TEST_EXPECT_TREELOAD_FAILED_WITH(tree, "Invalid duplicated bootstrap specification detected");
1044  }
1045 
1046  // test invalid trees
1047  TEST_EXPECT_TREESTRING_FAILS_WITH("(;);", "Expected one of ',)'");
1048 
1049  TEST_EXPECT_TREESTRING_FAILS_WITH("(17", "Expected one of ',)' while end-of-file was reached");
1050  TEST_EXPECT_TREESTRING_FAILS_WITH("((((", "Expected one of ',)' while end-of-file was reached");
1051  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, 'b", "Expected one of ',)' while end-of-file was reached");
1052 
1053  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, b:5::::", "Unexpected ':' (already read a branchlength) while looking at '::::<EOF>'");
1054  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, b:5:c:d", "Unexpected ':' (already read a branchlength) while looking at ':c:d<EOF>'");
1055  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, b:5:c:d)", "Unexpected ':' (already read a branchlength) while looking at ':c:d)<EOF>'");
1056 
1057  TEST_EXPECT_TREESTRING_FAILS_WITH("[unclosed\ncomment", "while reading comment");
1058  TEST_EXPECT_TREESTRING_FAILS_WITH("[unclosed\ncomment [ bla ]", "while reading comment");
1059 
1060  TEST_EXPECT_TREESTRING_FAILS_WITH("(a, b:d)", "Expected valid length while looking at 'd)<EOF>'");
1061 
1062  // questionable accepted trees / check warnings
1063  TEST_EXPECT_TREESTRING_OK_WITH_WARNING("(a,b):0.5", "(a,b);", "Length specified for root-node has been ignored");
1064  TEST_EXPECT_TREESTRING_OK_WITH_WARNING("(a, b))", "(a,b);", "Unexpected input-data after tree: ')'");
1065 
1066  TEST_EXPECT_TREESTRING_OK("(a*,b%);", "(a*,b%);"); // @@@ really accept such names?
1067  TEST_EXPECT_TREESTRING_OK("(a, b:5)", "(a,b);");
1068 
1069  // check errors
1070  TEST_EXPECT_TREEFILE_FAILS_WITH("trees/nosuch.tree", "No such file");
1071  TEST_EXPECT_TREEFILE_FAILS_WITH("trees/corrupted.tree", "Error reading");
1072 
1073  TEST_EXPECT_ZERO_OR_SHOW_ERRNO(GB_unlink("trees/tmp.tree")); // cleanup
1074 }
1075 
1076 #endif // UNIT_TESTS
1077 
1078 // --------------------------------------------------------------------------------
GB_ERROR GB_get_error()
Definition: arb_msg.cxx:344
GB_ERROR get_warnings() const
Definition: TreeRead.cxx:115
void set_bootstrap(double bootstrap)
Definition: TreeNode.h:282
#define arb_assert(cond)
Definition: arb_assert.h:245
const char * GB_ERROR
Definition: arb_core.h:25
string result
#define MAX_NAME_LEN
group_matcher all()
Definition: test_unit.h:1000
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
GB_ERROR GBT_write_tree_with_remark(GBDATA *gb_main, const char *tree_name, TreeNode *tree, const char *remark)
Definition: adtree.cxx:524
void TREE_scale(TreeNode *tree, double length_scale, double bootstrap_scale)
Definition: TreeTools.cxx:14
char * takeComment()
Definition: TreeRead.cxx:117
char * GBT_tree_2_newick(const TreeNode *tree, NewickFormat format, bool compact)
Definition: adtree.cxx:1353
GB_ERROR error
Definition: TreeRead.cxx:107
void forget_origin()
Definition: TreeNode.h:373
bool is_marked_as_default_len(GBT_LEN len)
Definition: TreeRead.h:20
GB_ERROR TREE_load_to_db(GBDATA *gb_main, const char *treefile, const char *tree_name)
Definition: TreeRead.cxx:712
GB_ERROR GB_export_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:325
#define DEFAULT_BRANCH_LENGTH
Definition: arbdbt.h:18
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
#define FORWARD_FORMATTED(receiver, format)
Definition: arb_msg_fwd.h:19
bool GB_have_error()
Definition: arb_msg.cxx:349
char * release()
Definition: arb_strbuf.h:80
int GB_unlink(const char *path)
Definition: arb_file.cxx:188
#define ARRAY_ELEMS(array)
Definition: arb_defs.h:19
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
GBT_LEN leftlen
Definition: TreeNode.h:132
TreeNode * rightson
Definition: TreeNode.h:131
double get_max_found_bootstrap() const
Definition: TreeRead.cxx:122
__ATTR__FORMAT(2) void add_warningf(const char *format
#define DEFAULT_BRANCH_LENGTH_MARKER
Definition: TreeRead.h:19
#define TEST_EXPECT_CONTAINS(str, part)
Definition: test_unit.h:1301
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
#define tree_assert(cond)
Definition: TreeRead.cxx:20
#define is_equal_to_NULL()
Definition: test_unit.h:1017
virtual TreeNode * makeNode() const =0
TreeReader(FILE *input, const char *file_name, TreeRoot *troot_)
Definition: TreeRead.cxx:126
void message(char *errortext)
#define TEST_REJECT(cond)
Definition: test_unit.h:1315
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1310
TreeNode * father
Definition: TreeNode.h:131
static void error(const char *msg)
Definition: mkptypes.cxx:96
TreeNode * TREE_load(const char *path, TreeRoot *troot, char **commentPtr, bool allow_length_scaling, char **warningPtr)
Definition: TreeRead.cxx:631
expectation_group & add(const expectation &e)
Definition: test_unit.h:801
#define that(thing)
Definition: test_unit.h:1032
#define TEST_EXPECT_ZERO_OR_SHOW_ERRNO(iocond)
Definition: test_unit.h:1079
static SearchTree * tree[SEARCH_PATTERNS]
Definition: ED4_search.cxx:629
TreeNode * load()
Definition: TreeRead.cxx:86
TreeNode * leftson
Definition: TreeNode.h:131
char * GBS_log_action_to(const char *comment, const char *action, bool stamp)
Definition: adstring.cxx:990
#define does_differ_from_NULL()
Definition: test_unit.h:1018
GBT_LEN rightlen
Definition: TreeNode.h:132
#define is_equal_to(val)
Definition: test_unit.h:1014
GBT_LEN get_max_found_branchlen() const
Definition: TreeRead.cxx:123
#define does_contain(val)
Definition: test_unit.h:1029
fputs(TRACE_PREFIX, stderr)
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:264
bool is_leaf() const
Definition: TreeNode.h:171
#define TEST_EXPECT_NULL(n)
Definition: test_unit.h:1307
static list< LineAttachedMessage > warnings
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:32
const char * name_only(const char *fullpath)
Definition: AWTI_import.cxx:46
#define __ATTR__USERESULT
Definition: attributes.h:58
char * name
Definition: TreeNode.h:134
void announce_tree_constructed()
Definition: TreeNode.h:364
static TreeNode * createLinkedTreeNode(const TreeRoot &nodeMaker, TreeNode *left, GBT_LEN leftlen, TreeNode *right, GBT_LEN rightlen)
Definition: TreeRead.cxx:494
void GBT_message(GBDATA *gb_main, const char *msg)
Definition: adtools.cxx:238
float GB_atof(const char *str)
Definition: arbdb.cxx:184
float GBT_LEN
Definition: arbdb_base.h:34
#define NULp
Definition: cxxforward.h:97
void add_warning(const char *msg)
Definition: TreeRead.cxx:109
void markAsLeaf()
Definition: TreeNode.h:172
GB_transaction ta(gb_var)
void destroy(TreeNode *that)
Definition: TreeNode.h:559
GBDATA * gb_main
Definition: adname.cxx:33
const char * get_remark() const
Definition: TreeNode.h:266
void delete_by_node()
Definition: TreeNode.h:94
#define STATIC_ASSERT(const_expression)
Definition: static_assert.h:36
#define NAME_TOO_LONG
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
size_t get_position() const
Definition: arb_strbuf.h:65
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
void put(char c)
Definition: arb_strbuf.h:138
#define UNCOVERED()
Definition: arb_assert.h:380
#define MAX_DROPPED_GROUP_WARN
Definition: TreeRead.cxx:26
#define max(a, b)
Definition: f2c.h:154
GB_write_int const char s
Definition: AW_awar.cxx:156