ARB
TreeWrite.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : TreeWrite.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include <TreeWrite.h>
12 #include <TreeNode.h>
13 #include <arb_strbuf.h>
14 #include <arb_file.h>
15 #include <xml.hxx>
16 
17 using namespace std;
18 
19 #define tree_assert(cond) arb_assert(cond)
20 
// Overwrites every character of 'str' that also occurs in 'toReplace' with '_'.
// 'str' is modified in place; its length does not change.
inline void replace_by_underscore(char *str, const char *toReplace) {
    for (char *hit = strpbrk(str, toReplace); hit; hit = strpbrk(hit+1, toReplace)) {
        *hit = '_';
    }
}
28 
// true if 'c' is one of the two quote characters (double or single quote)
inline bool isQuoteChar(char c) { return c == '"' || c == '\''; }

// true if 'label' (consisting of 'length' characters) starts and ends with the same quote character.
//
// At least two characters are required: a label consisting of one lonely quote
// character is NOT considered quoted. Callers strip the outer quotes by keeping
// 'length-2' characters, which would underflow for shorter labels.
inline bool whole_label_quoted(const char *label, size_t length) {
    return length >= 2 && isQuoteChar(label[0]) && label[0] == label[length-1];
}
31 
32 
33 const char *Node_ID_Labeler::speciesLabel(GBDATA *, GBDATA *, TreeNode *leafNode, const char *) const {
34  return leafNode->name;
35 }
36 const char *Node_ID_Labeler::groupLabel(GBDATA *, GBDATA *, TreeNode *innerNode, const char *) const {
37  return innerNode->name;
38 }
39 
// Returns the first character of 'label' that lies outside the 7-bit ASCII range,
// or 0 if 'label' consists of ASCII characters only.
inline char first_non_ascii_char(const char *label) {
    for (const char *p = label; *p; ++p) {
        // Inspect the value as unsigned char: whether plain 'char' is signed is
        // implementation-defined, so testing 'c<0' would miss non-ASCII
        // characters on platforms where char is unsigned.
        if ((unsigned char)*p >= 0x80) {
            return *p;
        }
    }
    return 0;
}
48 
49 static GB_ERROR export_tree_label(const char *label, FILE *out, LabelQuoting qmode) {
50  // writes a label into the Newick file
51  // label is quoted if necessary
52  // label may be an internal_node_label, a leaf_label or a root_label
54 
55  tree_assert(label);
56 
57  bool accept_non_ascii = qmode&LABEL_ACCEPT_NON_ASCII;
58  char non_ASCII = accept_non_ascii ? 0 : first_non_ascii_char(label);
59 
60  if (non_ASCII) {
61  error = GBS_global_string("non-ASCII character ('%c'=%i) detected in label '%s'",
62  non_ASCII, int(safeCharIndex(non_ASCII)), label);
63  }
64  else {
65  const char *disallowed_chars = " \t\'\"()[]:;,"; // '(' is first problem_char
66  const char *problem_chars = disallowed_chars+4;
67  tree_assert(problem_chars[0] == '(');
68 
69  bool need_quotes = strpbrk(label, disallowed_chars);
70  char used_quote = 0;
71  bool force_quotes = (qmode & LABEL_FORCE_QUOTES);
72 
73  if (force_quotes || need_quotes) {
74  if (qmode&LABEL_SINGLE_QUOTES) used_quote = '\'';
75  else if (qmode&LABEL_DOUBLE_QUOTES) used_quote = '\"';
76  }
77 
78  char *fixed_label;
79  {
80  size_t label_length = strlen(label);
81  fixed_label = ARB_strduplen(label, label_length);
82 
83  if (whole_label_quoted(fixed_label, label_length)) {
84  // if whole label is quoted -> remove quotes
85  memmove(fixed_label, fixed_label+1, label_length-2);
86  fixed_label[label_length-2] = 0;
87  }
88  }
89 
90  if (used_quote) {
91  // replace all problematic characters if requested
92  bool force_replace = (qmode & LABEL_FORCE_REPLACE);
93  if (force_replace) replace_by_underscore(fixed_label, problem_chars);
94 
95  // replace used quote by an '_' if it appears inside label
96  char used_quote_as_string[] = { used_quote, 0 };
97  replace_by_underscore(fixed_label, used_quote_as_string);
98 
99  if (!force_quotes) {
100  need_quotes = strpbrk(fixed_label, disallowed_chars);
101  if (!need_quotes) used_quote = 0; // @@@ fails if both quote-types are used in one name
102  }
103  }
104  else {
105  // unquoted label - always replace all problematic characters by '_'
106  replace_by_underscore(fixed_label, disallowed_chars);
107  }
108 
109  if (used_quote) fputc(used_quote, out);
110  fputs(fixed_label, out);
111  if (used_quote) fputc(used_quote, out);
112 
113  free(fixed_label);
114  }
115 
116  return error;
117 }
118 
119 
120 
121 // documentation of the Newick Format is in ../../SOURCE_TOOLS/docs/newick_doc.html
122 
// Writes 'indent' indentation levels (two blanks per level) to 'out'.
// Nothing is written if 'indent' is zero or negative.
inline void indentTo(int indent, FILE *out) {
    for (int level = indent; level > 0; --level) {
        fputs("  ", out);
    }
}
129 
130 static GB_ERROR export_tree_node_print(GBDATA *gb_main, FILE *out, TreeNode *tree, const char *tree_name,
131  bool pretty, int indent,
132  const TreeLabeler& labeler, bool save_branchlengths,
133  bool save_bootstraps, bool save_groupnames, LabelQuoting qmode)
134 {
135  GB_ERROR error = NULp;
136 
137  if (pretty) indentTo(indent, out);
138 
139  if (tree->is_leaf()) {
140  const char *label = labeler.speciesLabel(gb_main, tree->gb_node, tree, tree_name);
141  error = export_tree_label(label, out, qmode);
142  }
143  else {
144  if (pretty) fputs("(\n", out);
145  else putc('(', out);
146 
147  error = export_tree_node_print(gb_main, out, tree->get_leftson(), tree_name, pretty, indent+1, labeler, save_branchlengths, save_bootstraps, save_groupnames, qmode);
148  if (!error) {
149  if (save_branchlengths) fprintf(out, ":%.5f", tree->leftlen);
150  fputs(",\n", out);
151 
152  error = export_tree_node_print(gb_main, out, tree->get_rightson(), tree_name, pretty, indent+1, labeler, save_branchlengths, save_bootstraps, save_groupnames, qmode);
153  }
154  if (!error) {
155  if (save_branchlengths) fprintf(out, ":%.5f", tree->rightlen);
156  fputc('\n', out);
157 
158  if (pretty) indentTo(indent, out);
159  fputc(')', out);
160 
161  char *bootstrap = NULp;
162  if (save_bootstraps) {
163  double value;
164  switch (tree->parse_bootstrap(value)) {
165  case REMARK_BOOTSTRAP: bootstrap = GBS_global_string_copy("%i", int(value+0.5)); break;
166  case REMARK_OTHER: bootstrap = strdup(tree->get_remark()); break;
167  case REMARK_NONE: break;
168  }
169  }
170 
171  const char *label = NULp;
172  {
173  bool useGroupLabel = save_groupnames && tree->has_group_info();
174  if (useGroupLabel) {
175  const char *group = labeler.groupLabel(gb_main, tree->gb_node, tree, tree_name);
176  if (tree->keelTarget()) {
177  error = GBS_global_string("contains a keeled group named '%s'\n"
178  "(to export a tree, you have to correct all keeled groups)", group);
179  }
180  else {
181  if (bootstrap) label = GBS_global_string("%s:%s", bootstrap, group);
182  else label = group;
183  }
184  }
185  else if (bootstrap) label = bootstrap;
186  }
187 
188  if (label) {
189  arb_assert(!error);
190  error = export_tree_label(label, out, qmode);
191  }
192 
193  free(bootstrap);
194  }
195  }
196 
197  return error;
198 }
199 
200 inline string buildNodeIdentifier(const string& parent_id, int& son_counter) {
201  ++son_counter;
202  if (parent_id.empty()) return GBS_global_string("n_%i", son_counter);
203  return GBS_global_string("%s.%i", parent_id.c_str(), son_counter);
204 }
205 
206 static const char *export_tree_node_print_xml(GBDATA *gb_main, TreeNode *tree, double my_length, const char *tree_name,
207  const TreeLabeler& labeler, bool skip_folded, const string& parent_id, int& parent_son_counter) {
208  const char *error = NULp;
209 
210  if (tree->is_leaf()) {
211  XML_Tag item_tag("ITEM");
212 
213  item_tag.add_attribute("name", buildNodeIdentifier(parent_id, parent_son_counter));
214  item_tag.add_attribute("itemname", labeler.speciesLabel(gb_main, tree->gb_node, tree, tree_name));
215  item_tag.add_attribute("length", GBS_global_string("%.5f", my_length));
216  }
217  else {
218  char *bootstrap = NULp;
219  {
220  double value;
221  switch (tree->parse_bootstrap(value)) {
222  case REMARK_BOOTSTRAP: bootstrap = GBS_global_string_copy("%i", int(value+0.5)); break;
223  case REMARK_OTHER: break; // @@@ other branch-remarks are currently not saved into xml format
224  case REMARK_NONE: break;
225  }
226  }
227 
228  bool folded = false;
229  char *groupname = NULp;
230  if (tree->name) {
231  const char *buf = labeler.speciesLabel(gb_main, tree->gb_node, tree, tree_name);
232  tree_assert(buf);
233  groupname = strdup(buf);
234 
235  GBDATA *gb_grouped = GB_entry(tree->gb_node, "grouped");
236  if (gb_grouped) {
237  folded = GB_read_byte(gb_grouped);
238  }
239  }
240 
241  if (my_length || bootstrap || groupname) {
242  bool hide_this_group = skip_folded && folded; // hide folded groups only if skip_folded is true
243 
244  XML_Tag branch_tag(hide_this_group ? "FOLDED_GROUP" : "BRANCH");
245  string my_id = buildNodeIdentifier(parent_id, parent_son_counter);
246 
247  branch_tag.add_attribute("name", my_id);
248 
249  if (my_length) {
250  branch_tag.add_attribute("length", GBS_global_string("%.5f", my_length));
251  }
252  if (bootstrap) {
253  branch_tag.add_attribute("bootstrap", bootstrap);
254  freenull(bootstrap);
255  }
256  if (groupname) {
257  branch_tag.add_attribute("groupname", groupname);
258  freenull(groupname);
259  if (folded) branch_tag.add_attribute("folded", "1");
260  }
261  else {
262  tree_assert(!folded);
263  }
264 
265  if (hide_this_group) {
266  branch_tag.add_attribute("items_in_group", GBT_count_leafs(tree));
267  }
268  else {
269  int my_son_counter = 0;
270  if (!error) error = export_tree_node_print_xml(gb_main, tree->get_leftson(), tree->leftlen, tree_name, labeler, skip_folded, my_id, my_son_counter);
271  if (!error) error = export_tree_node_print_xml(gb_main, tree->get_rightson(), tree->rightlen, tree_name, labeler, skip_folded, my_id, my_son_counter);
272  }
273  }
274  else {
275  if (!error) error = export_tree_node_print_xml(gb_main, tree->get_leftson(), tree->leftlen, tree_name, labeler, skip_folded, parent_id, parent_son_counter);
276  if (!error) error = export_tree_node_print_xml(gb_main, tree->get_rightson(), tree->rightlen, tree_name, labeler, skip_folded, parent_id, parent_son_counter);
277  }
278  }
279 
280  return error;
281 }
282 
283 GB_ERROR TREE_write_XML(GBDATA *gb_main, const char *db_name, const char *tree_name, const TreeLabeler& labeler, bool skip_folded, const char *path) {
284  GB_ERROR error = NULp;
285  FILE *output = fopen(path, "w");
286 
287  if (!output) error = GB_export_errorf("file '%s' could not be opened for writing", path);
288  else {
289  GB_transaction ta(gb_main);
290 
291  TreeNode *tree = GBT_read_tree(gb_main, tree_name, new SimpleRoot);
292  if (!tree) error = GB_await_error();
293  else {
294  error = GBT_link_tree(tree, gb_main, true, NULp, NULp);
295 
296  if (!error) {
297  GBDATA *tree_cont = GBT_find_tree(gb_main, tree_name);
298  GBDATA *tree_remark = GB_entry(tree_cont, "remark");
299 
300  XML_Document xml_doc("ARB_TREE", "arb_tree.dtd", output);
301 
302  xml_doc.add_attribute("database", db_name);
303  xml_doc.add_attribute("treename", tree_name);
304  xml_doc.add_attribute("export_date", ARB_date_string());
305 
306  if (tree_remark) {
307  char *remark = GB_read_string(tree_remark);
308  XML_Tag remark_tag("COMMENT");
309  XML_Text remark_text(remark);
310  free(remark);
311  }
312 
313  int my_son_counter = 0;
314  error = export_tree_node_print_xml(gb_main, tree, 0.0, tree_name, labeler, skip_folded, "", my_son_counter);
315  }
316  }
317  fclose(output);
318  }
319 
320  return error;
321 }
322 
323 static char *complete_newick_comment(const char *comment) {
324  // ensure that all '[' in 'comment' are closed by corresponding ']' by inserting additional brackets
325 
326  int openBrackets = 0;
327  GBS_strstruct out(strlen(comment)*1.1);
328 
329  for (int o = 0; comment[o]; ++o) {
330  switch (comment[o]) {
331  case '[':
332  openBrackets++;
333  break;
334  case ']':
335  if (openBrackets == 0) { // no brackets opened
336  out.put('['); // insert one to enforce balancing
337  }
338  else {
339  openBrackets--;
340  }
341  break;
342 
343  default:
344  break;
345  }
346  out.put(comment[o]);
347  }
348 
349  while (openBrackets>0) { // close all opened brackets
350  out.put(']');
351  openBrackets--;
352  }
353 
354  tree_assert(openBrackets == 0);
355 
356  return out.release();
357 }
358 
359 GB_ERROR TREE_write_Newick(GBDATA *gb_main, const char *tree_name, const TreeLabeler& labeler, bool save_branchlengths, bool save_bootstraps, bool save_groupnames, bool pretty, LabelQuoting quoteMode, const char *path) {
360  // userland newick exporter
361  // see also: testcode newick exporter in ../../ARBDB/adtree.cxx@NEWICK_EXPORTER
362 
363  GB_ERROR error = NULp;
364  FILE *output = fopen(path, "w");
365 
366  if (!output) error = GBS_global_string("file '%s' could not be opened for writing", path);
367  else {
368  GB_transaction ta(gb_main);
369 
370  TreeNode *tree = GBT_read_tree(gb_main, tree_name, new SimpleRoot);
371  if (!tree) error = GB_await_error();
372  else {
373  error = GBT_link_tree(tree, gb_main, true, NULp, NULp);
374 
375  if (!error) {
376  char *remark = NULp;
377  GBDATA *tree_cont = GBT_find_tree(gb_main, tree_name);
378  GBDATA *tree_remark = GB_entry(tree_cont, "remark");
379 
380  if (tree_remark) {
381  remark = GB_read_string(tree_remark);
382  }
383  {
384  const char *saved_to = GBS_global_string("%s saved to %s", tree_name, path);
385  freeset(remark, GBS_log_action_to(remark, saved_to, true));
386  }
387 
388  if (remark) {
389  char *wellformed = complete_newick_comment(remark);
390 
391  tree_assert(wellformed);
392 
393  fputc('[', output); fputs(wellformed, output); fputs("]\n", output);
394  free(wellformed);
395  }
396  free(remark);
397  if (!error) {
398  error = export_tree_node_print(gb_main, output, tree, tree_name, pretty, 0, labeler, save_branchlengths, save_bootstraps, save_groupnames, quoteMode);
399  }
400  }
401 
402  destroy(tree);
403  }
404 
405  fprintf(output, ";\n");
406  fclose(output);
407 
408  if (error) {
409  GB_unlink_or_warn(path, &error);
410  }
411  }
412 
413  return error;
414 }
415 
416 // --------------------------------------------------------------------------------
417 
// Replaces every single or double quote in 'str' by '.' (in place).
static void export_tree_node_print_remove(char *str) {
    for (char *p = str; *p; ++p) {
        if (*p == '\'' || *p == '\"') *p = '.';
    }
}
425 
426 static void export_tree_rek(TreeNode *tree, FILE *out, bool export_branchlens, bool dquot) {
427  if (tree->is_leaf()) {
429  fprintf(out,
430  dquot ? " \"%s\" " : " '%s' ",
431  tree->name);
432  }
433  else {
434  fputc('(', out);
435  export_tree_rek(tree->get_leftson(), out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f,", tree->leftlen);
436  export_tree_rek(tree->get_rightson(), out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", tree->rightlen);
437  fputc(')', out);
438 
439  if (tree->name) {
441  fprintf(out,
442  dquot ? "\"%s\"" : "'%s'",
443  tree->name);
444  }
445  }
446 }
447 
448 // @@@ maybe replace TREE_export_tree by TREE_write_Newick (need some additional parameters: no comment, trifurcation)
449 
450 GB_ERROR TREE_export_tree(GBDATA *, FILE *out, TreeNode *tree, bool triple_root, bool export_branchlens, bool dquot) {
451  if (triple_root) {
452  TreeNode *one, *two, *three;
453  if (tree->is_leaf()) {
454  return GB_export_error("Tree is too small, minimum 3 nodes");
455  }
456  if (tree->leftson->is_leaf() && tree->rightson->is_leaf()) {
457  return GB_export_error("Tree is too small, minimum 3 nodes");
458  }
459  if (tree->leftson->is_leaf()) {
460  one = tree->get_leftson();
461  two = tree->get_rightson()->get_leftson();
462  three = tree->get_rightson()->get_rightson();
463  }
464  else {
465  one = tree->get_leftson()->get_leftson();
466  two = tree->get_leftson()->get_rightson();
467  three = tree->get_rightson();
468  }
469  fputc('(', out);
470  export_tree_rek(one, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); fputc(',', out);
471  export_tree_rek(two, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); fputc(',', out);
472  export_tree_rek(three, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0);
473  fputc(')', out);
474  }
475  else {
476  export_tree_rek(tree, out, export_branchlens, dquot);
477  }
478  return NULp;
479 }
480 
481 
#define arb_assert(cond)
Definition: arb_assert.h:245
const char * GB_ERROR
Definition: arb_core.h:25
GB_ERROR TREE_export_tree(GBDATA *, FILE *out, TreeNode *tree, bool triple_root, bool export_branchlens, bool dquot)
Definition: TreeWrite.cxx:450
size_t GBT_count_leafs(const TreeNode *tree)
Definition: adtree.cxx:842
void indentTo(int indent, FILE *out)
Definition: TreeWrite.cxx:123
GB_ERROR TREE_write_XML(GBDATA *gb_main, const char *db_name, const char *tree_name, const TreeLabeler &labeler, bool skip_folded, const char *path)
Definition: TreeWrite.cxx:283
GBT_RemarkType parse_bootstrap(double &bootstrap) const
Definition: TreeNode.h:302
virtual const char * speciesLabel(GBDATA *gb_main, GBDATA *gb_species, TreeNode *species, const char *tree_name) const =0
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
bool has_group_info() const
Definition: TreeNode.h:444
void GB_unlink_or_warn(const char *path, GB_ERROR *error)
Definition: arb_file.cxx:206
static const char * export_tree_node_print_xml(GBDATA *gb_main, TreeNode *tree, double my_length, const char *tree_name, const TreeLabeler &labeler, bool skip_folded, const string &parent_id, int &parent_son_counter)
Definition: TreeWrite.cxx:206
const char * ARB_date_string()
Definition: arb_string.cxx:35
TreeNode * GBT_read_tree(GBDATA *gb_main, const char *tree_name, TreeRoot *troot)
Definition: adtree.cxx:837
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
STL namespace.
char * release()
Definition: arb_strbuf.h:129
GB_ERROR GBT_link_tree(TreeNode *tree, GBDATA *gb_main, bool show_status, int *zombies, int *duplicates)
Definition: adtree.cxx:953
GBT_LEN leftlen
Definition: TreeNode.h:172
TreeNode * rightson
Definition: TreeNode.h:171
GB_ERROR GB_export_error(const char *error)
Definition: arb_msg.cxx:257
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
char * ARB_strduplen(const char *p, unsigned len)
Definition: arb_string.h:33
string buildNodeIdentifier(const string &parent_id, int &son_counter)
Definition: TreeWrite.cxx:200
const char * speciesLabel(GBDATA *, GBDATA *, TreeNode *species, const char *) const OVERRIDE
Definition: TreeWrite.cxx:33
static void export_tree_node_print_remove(char *str)
Definition: TreeWrite.cxx:418
static int group[MAXN+1]
Definition: ClustalV.cxx:65
static void error(const char *msg)
Definition: mkptypes.cxx:96
void replace_by_underscore(char *str, const char *toReplace)
Definition: TreeWrite.cxx:21
fputc('\n', stderr)
static char * complete_newick_comment(const char *comment)
Definition: TreeWrite.cxx:323
TreeNode * leftson
Definition: TreeNode.h:171
GB_ERROR TREE_write_Newick(GBDATA *gb_main, const char *tree_name, const TreeLabeler &labeler, bool save_branchlengths, bool save_bootstraps, bool save_groupnames, bool pretty, LabelQuoting quoteMode, const char *path)
Definition: TreeWrite.cxx:359
char * GBS_log_action_to(const char *comment, const char *action, bool stamp)
Definition: adstring.cxx:976
GBT_LEN rightlen
Definition: TreeNode.h:172
a xml text node
Definition: xml.hxx:122
fputs(TRACE_PREFIX, stderr)
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:262
bool is_leaf() const
Definition: TreeNode.h:211
const char * groupLabel(GBDATA *, GBDATA *, TreeNode *species, const char *) const OVERRIDE
Definition: TreeWrite.cxx:36
Definition: output.h:122
bool whole_label_quoted(const char *label, size_t length)
Definition: TreeWrite.cxx:30
char * name
Definition: TreeNode.h:174
int GB_read_byte(GBDATA *gbd)
Definition: arbdb.cxx:734
static void export_tree_rek(TreeNode *tree, FILE *out, bool export_branchlens, bool dquot)
Definition: TreeWrite.cxx:426
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:909
static GB_ERROR export_tree_node_print(GBDATA *gb_main, FILE *out, TreeNode *tree, const char *tree_name, bool pretty, int indent, const TreeLabeler &labeler, bool save_branchlengths, bool save_bootstraps, bool save_groupnames, LabelQuoting qmode)
Definition: TreeWrite.cxx:130
static GB_ERROR export_tree_label(const char *label, FILE *out, LabelQuoting qmode)
Definition: TreeWrite.cxx:49
#define NULp
Definition: cxxforward.h:116
LabelQuoting
Definition: TreeWrite.h:18
bool isQuoteChar(char c)
Definition: TreeWrite.cxx:29
TreeNode * keelTarget()
Definition: TreeNode.h:448
GBDATA * GBT_find_tree(GBDATA *gb_main, const char *tree_name)
Definition: adtree.cxx:993
GB_transaction ta(gb_var)
void destroy(TreeNode *that)
Definition: TreeNode.h:600
GBDATA * gb_node
Definition: TreeNode.h:173
GBDATA * gb_main
Definition: adname.cxx:32
const char * get_remark() const
Definition: TreeNode.h:307
size_t length
char first_non_ascii_char(const char *label)
Definition: TreeWrite.cxx:40
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
Definition: output.h:28
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
const char * label
void put(char c)
Definition: arb_strbuf.h:174
#define tree_assert(cond)
Definition: TreeWrite.cxx:19