ARB
TreeWrite.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : TreeWrite.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include <TreeWrite.h>
12 #include <TreeNode.h>
13 #include <arb_strbuf.h>
14 #include <arb_file.h>
15 #include <xml.hxx>
16 
17 using namespace std;
18 
19 #define tree_assert(cond) arb_assert(cond)
20 
// Replaces (in place) every character of 'str' that also occurs in 'toReplace' by '_'.
// An empty 'toReplace' leaves 'str' untouched.
inline void replace_by_underscore(char *str, const char *toReplace) {
    for (char *hit = strpbrk(str, toReplace); hit; hit = strpbrk(hit+1, toReplace)) {
        *hit = '_';
    }
}
28 
// true if 'c' is a single or a double quote
inline bool isQuoteChar(char c) { return c == '"' || c == '\''; }

// true if 'label' (consisting of 'length' characters) starts and ends with the same quote character.
// At least 2 characters are required: a label consisting of one lone quote character is NOT
// considered quoted (otherwise its first character would be compared with itself and callers
// stripping the surrounding quotes would have to handle 'length-2' wrapping around).
inline bool whole_label_quoted(const char *label, size_t length) {
    return length >= 2 && isQuoteChar(label[0]) && label[0] == label[length-1];
}
31 
32 static void export_tree_label(const char *label, FILE *out, TREE_node_quoting qmode) {
33  // writes a label into the Newick file
34  // label is quoted if necessary
35  // label may be an internal_node_label, a leaf_label or a root_label
36  tree_assert(label);
37 
38  const char *disallowed_chars = " \t\'\"()[]:;,"; // '(' is first problem_char
39  const char *problem_chars = disallowed_chars+4;
40  tree_assert(problem_chars[0] == '(');
41 
42  bool need_quotes = strpbrk(label, disallowed_chars);
43  char used_quote = 0;
44  bool force_quotes = (qmode & TREE_FORCE_QUOTES);
45 
46  if (force_quotes || need_quotes) {
47  if (qmode&TREE_SINGLE_QUOTES) used_quote = '\'';
48  else if (qmode&TREE_DOUBLE_QUOTES) used_quote = '\"';
49  }
50 
51  char *fixed_label;
52  {
53  size_t label_length = strlen(label);
54  fixed_label = ARB_strduplen(label, label_length);
55 
56  if (whole_label_quoted(fixed_label, label_length)) {
57  // if whole label is quoted -> remove quotes
58  memmove(fixed_label, fixed_label+1, label_length-2);
59  fixed_label[label_length-2] = 0;
60  }
61  }
62 
63  if (used_quote) {
64  // replace all problematic characters if requested
65  bool force_replace = (qmode & TREE_FORCE_REPLACE);
66  if (force_replace) replace_by_underscore(fixed_label, problem_chars);
67 
68  // replace used quote by an '_' if it appears inside label
69  char used_quote_as_string[] = { used_quote, 0 };
70  replace_by_underscore(fixed_label, used_quote_as_string);
71 
72  if (!force_quotes) {
73  need_quotes = strpbrk(fixed_label, disallowed_chars);
74  if (!need_quotes) used_quote = 0; // @@@ fails if both quote-types are used in one name
75  }
76  }
77  else {
78  // unquoted label - always replace all problematic characters by '_'
79  replace_by_underscore(fixed_label, disallowed_chars);
80  }
81 
82  if (used_quote) fputc(used_quote, out);
83  fputs(fixed_label, out);
84  if (used_quote) fputc(used_quote, out);
85 
86  free(fixed_label);
87 }
88 
89 
90 
91 // documentation of the Newick Format is in ../SOURCE_TOOLS/docs/newick_doc.html
92 
// Writes 'indent' indentation levels (two blanks each) to 'out'.
// Nothing is written for indent <= 0.
inline void indentTo(int indent, FILE *out) {
    for (int level = indent; level > 0; --level) {
        fputs("  ", out);
    }
}
99 
100 static const char *export_tree_node_print(GBDATA *gb_main, FILE *out, TreeNode *tree, const char *tree_name,
101  bool pretty, int indent,
102  const TREE_node_text_gen *node_gen, bool save_branchlengths,
103  bool save_bootstraps, bool save_groupnames, TREE_node_quoting qmode)
104 {
105  const char *error = NULp;
106 
107  if (pretty) indentTo(indent, out);
108 
109  if (tree->is_leaf()) {
110  const char *label;
111  if (node_gen) label = node_gen->gen(gb_main, tree->gb_node, NDS_OUTPUT_LEAFTEXT, tree, tree_name);
112  else label = tree->name;
113 
114  export_tree_label(label, out, qmode);
115  }
116  else {
117  if (pretty) fputs("(\n", out);
118  else putc('(', out);
119 
120  error = export_tree_node_print(gb_main, out, tree->get_leftson(), tree_name, pretty, indent+1, node_gen, save_branchlengths, save_bootstraps, save_groupnames, qmode);
121  if (error) return error;
122 
123  if (save_branchlengths) fprintf(out, ":%.5f", tree->leftlen);
124  fputs(",\n", out);
125 
126  error = export_tree_node_print(gb_main, out, tree->get_rightson(), tree_name, pretty, indent+1, node_gen, save_branchlengths, save_bootstraps, save_groupnames, qmode);
127  if (error) return error;
128 
129  if (save_branchlengths) fprintf(out, ":%.5f", tree->rightlen);
130  fputc('\n', out);
131 
132  if (pretty) indentTo(indent, out);
133  fputc(')', out);
134 
135  char *bootstrap = NULp;
136  if (save_bootstraps) {
137  double value;
138  switch (tree->parse_bootstrap(value)) {
139  case REMARK_BOOTSTRAP: bootstrap = GBS_global_string_copy("%i", int(value+0.5)); break;
140  case REMARK_OTHER: bootstrap = strdup(tree->get_remark()); break;
141  case REMARK_NONE: break;
142  }
143  }
144 
145  const char *group = (save_groupnames && tree->has_group_info()) ? tree->name : NULp;
146  const char *label = NULp;
147  if (group) {
148  if (tree->keelTarget()) {
149  error = GBS_global_string("contains a keeled group named '%s'\n"
150  "(to export a tree, you have to correct all keeled groups)", group);
151  }
152  else {
153  if (bootstrap) label = GBS_global_string("%s:%s", bootstrap, group);
154  else label = group;
155  }
156  }
157  else if (bootstrap) label = bootstrap;
158 
159  if (label) export_tree_label(label, out, qmode);
160 
161  free(bootstrap);
162  }
163 
164  return error;
165 }
166 
167 inline string buildNodeIdentifier(const string& parent_id, int& son_counter) {
168  ++son_counter;
169  if (parent_id.empty()) return GBS_global_string("n_%i", son_counter);
170  return GBS_global_string("%s.%i", parent_id.c_str(), son_counter);
171 }
172 
173 static const char *export_tree_node_print_xml(GBDATA *gb_main, TreeNode *tree, double my_length, const char *tree_name,
174  const TREE_node_text_gen *node_gen, bool skip_folded, const string& parent_id, int& parent_son_counter) {
175  const char *error = NULp;
176 
177  if (tree->is_leaf()) {
178  XML_Tag item_tag("ITEM");
179 
180  item_tag.add_attribute("name", buildNodeIdentifier(parent_id, parent_son_counter));
181 
182  item_tag.add_attribute("itemname",
183  node_gen
184  ? node_gen->gen(gb_main, tree->gb_node, NDS_OUTPUT_LEAFTEXT, tree, tree_name)
185  : tree->name);
186 
187  item_tag.add_attribute("length", GBS_global_string("%.5f", my_length));
188  }
189  else {
190  char *bootstrap = NULp;
191  {
192  double value;
193  switch (tree->parse_bootstrap(value)) {
194  case REMARK_BOOTSTRAP: bootstrap = GBS_global_string_copy("%i", int(value+0.5)); break;
195  case REMARK_OTHER: break; // @@@ other branch-remarks are currently not saved into xml format
196  case REMARK_NONE: break;
197  }
198  }
199 
200  bool folded = false;
201  char *groupname = NULp;
202  if (tree->name) {
203  const char *buf;
204 
205  if (node_gen) buf = node_gen->gen(gb_main, tree->gb_node, NDS_OUTPUT_LEAFTEXT, tree, tree_name);
206  else buf = tree->name;
207 
208  tree_assert(buf);
209  groupname = strdup(buf);
210 
211  GBDATA *gb_grouped = GB_entry(tree->gb_node, "grouped");
212  if (gb_grouped) {
213  folded = GB_read_byte(gb_grouped);
214  }
215  }
216 
217  if (my_length || bootstrap || groupname) {
218  bool hide_this_group = skip_folded && folded; // hide folded groups only if skip_folded is true
219 
220  XML_Tag branch_tag(hide_this_group ? "FOLDED_GROUP" : "BRANCH");
221  string my_id = buildNodeIdentifier(parent_id, parent_son_counter);
222 
223  branch_tag.add_attribute("name", my_id);
224 
225  if (my_length) {
226  branch_tag.add_attribute("length", GBS_global_string("%.5f", my_length));
227  }
228  if (bootstrap) {
229  branch_tag.add_attribute("bootstrap", bootstrap);
230  freenull(bootstrap);
231  }
232  if (groupname) {
233  branch_tag.add_attribute("groupname", groupname);
234  freenull(groupname);
235  if (folded) branch_tag.add_attribute("folded", "1");
236  }
237  else {
238  tree_assert(!folded);
239  }
240 
241  if (hide_this_group) {
242  branch_tag.add_attribute("items_in_group", GBT_count_leafs(tree));
243  }
244  else {
245  int my_son_counter = 0;
246  if (!error) error = export_tree_node_print_xml(gb_main, tree->get_leftson(), tree->leftlen, tree_name, node_gen, skip_folded, my_id, my_son_counter);
247  if (!error) error = export_tree_node_print_xml(gb_main, tree->get_rightson(), tree->rightlen, tree_name, node_gen, skip_folded, my_id, my_son_counter);
248  }
249  }
250  else {
251  if (!error) error = export_tree_node_print_xml(gb_main, tree->get_leftson(), tree->leftlen, tree_name, node_gen, skip_folded, parent_id, parent_son_counter);
252  if (!error) error = export_tree_node_print_xml(gb_main, tree->get_rightson(), tree->rightlen, tree_name, node_gen, skip_folded, parent_id, parent_son_counter);
253  }
254  }
255 
256  return error;
257 }
258 
259 GB_ERROR TREE_write_XML(GBDATA *gb_main, const char *db_name, const char *tree_name, const TREE_node_text_gen *node_gen, bool skip_folded, const char *path) {
260  GB_ERROR error = NULp;
261  FILE *output = fopen(path, "w");
262 
263  if (!output) error = GB_export_errorf("file '%s' could not be opened for writing", path);
264  else {
265  GB_transaction ta(gb_main);
266 
267  TreeNode *tree = GBT_read_tree(gb_main, tree_name, new SimpleRoot);
268  if (!tree) error = GB_await_error();
269  else {
270  error = GBT_link_tree(tree, gb_main, true, NULp, NULp);
271  if (!error && node_gen) node_gen->init(gb_main);
272 
273  if (!error) {
274  GBDATA *tree_cont = GBT_find_tree(gb_main, tree_name);
275  GBDATA *tree_remark = GB_entry(tree_cont, "remark");
276 
277  XML_Document xml_doc("ARB_TREE", "arb_tree.dtd", output);
278 
279  xml_doc.add_attribute("database", db_name);
280  xml_doc.add_attribute("treename", tree_name);
281  xml_doc.add_attribute("export_date", ARB_date_string());
282 
283  if (tree_remark) {
284  char *remark = GB_read_string(tree_remark);
285  XML_Tag remark_tag("COMMENT");
286  XML_Text remark_text(remark);
287  free(remark);
288  }
289 
290  int my_son_counter = 0;
291  error = export_tree_node_print_xml(gb_main, tree, 0.0, tree_name, node_gen, skip_folded, "", my_son_counter);
292  }
293  }
294  fclose(output);
295  }
296 
297  return error;
298 }
299 
300 static char *complete_newick_comment(const char *comment) {
301  // ensure that all '[' in 'comment' are closed by corresponding ']' by inserting additional brackets
302 
303  int openBrackets = 0;
304  GBS_strstruct *out = GBS_stropen(strlen(comment)*1.1);
305 
306  for (int o = 0; comment[o]; ++o) {
307  switch (comment[o]) {
308  case '[':
309  openBrackets++;
310  break;
311  case ']':
312  if (openBrackets == 0) {
313  GBS_chrcat(out, '['); // insert one
314  }
315  else {
316  openBrackets--;
317  }
318  break;
319 
320  default:
321  break;
322  }
323  GBS_chrcat(out, comment[o]);
324  }
325 
326  while (openBrackets>0) {
327  GBS_chrcat(out, ']'); // insert one
328  openBrackets--;
329  }
330 
331  tree_assert(openBrackets == 0);
332 
333  return GBS_strclose(out);
334 }
335 
336 GB_ERROR TREE_write_Newick(GBDATA *gb_main, const char *tree_name, const TREE_node_text_gen *node_gen, bool save_branchlengths, bool save_bootstraps, bool save_groupnames, bool pretty, TREE_node_quoting quoteMode, const char *path) {
337  // userland newick exporter
338  // see also: testcode newick exporter in ../../ARBDB/adtree.cxx@NEWICK_EXPORTER
339 
340  GB_ERROR error = NULp;
341  FILE *output = fopen(path, "w");
342 
343  if (!output) error = GBS_global_string("file '%s' could not be opened for writing", path);
344  else {
345  GB_transaction ta(gb_main);
346 
347  TreeNode *tree = GBT_read_tree(gb_main, tree_name, new SimpleRoot);
348  if (!tree) error = GB_await_error();
349  else {
350  error = GBT_link_tree(tree, gb_main, true, NULp, NULp);
351  if (!error && node_gen) node_gen->init(gb_main);
352 
353  if (!error) {
354  char *remark = NULp;
355  GBDATA *tree_cont = GBT_find_tree(gb_main, tree_name);
356  GBDATA *tree_remark = GB_entry(tree_cont, "remark");
357 
358  if (tree_remark) {
359  remark = GB_read_string(tree_remark);
360  }
361  {
362  const char *saved_to = GBS_global_string("%s saved to %s", tree_name, path);
363  freeset(remark, GBS_log_action_to(remark, saved_to, true));
364  }
365 
366  if (remark) {
367  char *wellformed = complete_newick_comment(remark);
368 
369  tree_assert(wellformed);
370 
371  fputc('[', output); fputs(wellformed, output); fputs("]\n", output);
372  free(wellformed);
373  }
374  free(remark);
375  if (!error) {
376  error = export_tree_node_print(gb_main, output, tree, tree_name, pretty, 0, node_gen, save_branchlengths, save_bootstraps, save_groupnames, quoteMode);
377  }
378  }
379 
380  destroy(tree);
381  }
382 
383  fprintf(output, ";\n");
384  fclose(output);
385 
386  if (error) {
387  GB_unlink_or_warn(path, &error);
388  }
389  }
390 
391  return error;
392 }
393 
394 // --------------------------------------------------------------------------------
395 
// Replaces (in place) every single or double quote in 'str' by '.'.
static void export_tree_node_print_remove(char *str) {
    for (char *cur = str; *cur; ++cur) {
        switch (*cur) {
            case '\'':
            case '\"':
                *cur = '.';
                break;
            default:
                break;
        }
    }
}
403 
404 static void export_tree_rek(TreeNode *tree, FILE *out, bool export_branchlens, bool dquot) {
405  if (tree->is_leaf()) {
407  fprintf(out,
408  dquot ? " \"%s\" " : " '%s' ",
409  tree->name);
410  }
411  else {
412  fputc('(', out);
413  export_tree_rek(tree->get_leftson(), out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f,", tree->leftlen);
414  export_tree_rek(tree->get_rightson(), out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", tree->rightlen);
415  fputc(')', out);
416 
417  if (tree->name) {
419  fprintf(out,
420  dquot ? "\"%s\"" : "'%s'",
421  tree->name);
422  }
423  }
424 }
425 
426 #if defined(WARN_TODO)
427 #warning maybe replace TREE_export_tree by TREE_write_Newick
428 // need some additional parameters (no comment, trifurcation)
429 #endif
430 
431 GB_ERROR TREE_export_tree(GBDATA *, FILE *out, TreeNode *tree, bool triple_root, bool export_branchlens, bool dquot) {
432  if (triple_root) {
433  TreeNode *one, *two, *three;
434  if (tree->is_leaf()) {
435  return GB_export_error("Tree is too small, minimum 3 nodes");
436  }
437  if (tree->leftson->is_leaf() && tree->rightson->is_leaf()) {
438  return GB_export_error("Tree is too small, minimum 3 nodes");
439  }
440  if (tree->leftson->is_leaf()) {
441  one = tree->get_leftson();
442  two = tree->get_rightson()->get_leftson();
443  three = tree->get_rightson()->get_rightson();
444  }
445  else {
446  one = tree->get_leftson()->get_leftson();
447  two = tree->get_leftson()->get_rightson();
448  three = tree->get_rightson();
449  }
450  fputc('(', out);
451  export_tree_rek(one, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); fputc(',', out);
452  export_tree_rek(two, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); fputc(',', out);
453  export_tree_rek(three, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0);
454  fputc(')', out);
455  }
456  else {
457  export_tree_rek(tree, out, export_branchlens, dquot);
458  }
459  return NULp;
460 }
static void export_tree_label(const char *label, FILE *out, TREE_node_quoting qmode)
Definition: TreeWrite.cxx:32
const char * GB_ERROR
Definition: arb_core.h:25
GB_ERROR TREE_export_tree(GBDATA *, FILE *out, TreeNode *tree, bool triple_root, bool export_branchlens, bool dquot)
Definition: TreeWrite.cxx:431
size_t GBT_count_leafs(const TreeNode *tree)
Definition: adtree.cxx:796
void indentTo(int indent, FILE *out)
Definition: TreeWrite.cxx:93
GBT_RemarkType parse_bootstrap(double &bootstrap) const
Definition: TreeNode.h:261
bool has_group_info() const
Definition: TreeNode.h:403
GB_ERROR TREE_write_XML(GBDATA *gb_main, const char *db_name, const char *tree_name, const TREE_node_text_gen *node_gen, bool skip_folded, const char *path)
Definition: TreeWrite.cxx:259
void GB_unlink_or_warn(const char *path, GB_ERROR *error)
Definition: arb_file.cxx:206
const char * ARB_date_string()
Definition: arb_string.cxx:35
TreeNode * GBT_read_tree(GBDATA *gb_main, const char *tree_name, TreeRoot *troot)
Definition: adtree.cxx:791
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
STL namespace.
GB_ERROR GBT_link_tree(TreeNode *tree, GBDATA *gb_main, bool show_status, int *zombies, int *duplicates)
Definition: adtree.cxx:907
GBT_LEN leftlen
Definition: TreeNode.h:132
TreeNode * rightson
Definition: TreeNode.h:131
GB_ERROR GB_export_error(const char *error)
Definition: arb_msg.cxx:259
GBS_strstruct * GBS_stropen(long init_size)
Definition: arb_strbuf.cxx:39
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
TREE_node_quoting
Definition: TreeWrite.h:34
char * ARB_strduplen(const char *p, unsigned len)
Definition: arb_string.h:33
string buildNodeIdentifier(const string &parent_id, int &son_counter)
Definition: TreeWrite.cxx:167
static void export_tree_node_print_remove(char *str)
Definition: TreeWrite.cxx:396
static int group[MAXN+1]
Definition: ClustalV.cxx:65
GB_ERROR TREE_write_Newick(GBDATA *gb_main, const char *tree_name, const TREE_node_text_gen *node_gen, bool save_branchlengths, bool save_bootstraps, bool save_groupnames, bool pretty, TREE_node_quoting quoteMode, const char *path)
Definition: TreeWrite.cxx:336
static const char * export_tree_node_print(GBDATA *gb_main, FILE *out, TreeNode *tree, const char *tree_name, bool pretty, int indent, const TREE_node_text_gen *node_gen, bool save_branchlengths, bool save_bootstraps, bool save_groupnames, TREE_node_quoting qmode)
Definition: TreeWrite.cxx:100
static void error(const char *msg)
Definition: mkptypes.cxx:96
void replace_by_underscore(char *str, const char *toReplace)
Definition: TreeWrite.cxx:21
fputc('\n', stderr)
GB_write_int const char GB_write_autoconv_string WRITE_SKELETON(write_pointer, GBDATA *,"%p", GB_write_pointer) char *AW_awa if)(!gb_var) return strdup("")
Definition: AW_awar.cxx:166
static char * complete_newick_comment(const char *comment)
Definition: TreeWrite.cxx:300
TREE_make_node_text gen
Definition: TreeWrite.h:26
TreeNode * leftson
Definition: TreeNode.h:131
char * GBS_log_action_to(const char *comment, const char *action, bool stamp)
Definition: adstring.cxx:990
GBT_LEN rightlen
Definition: TreeNode.h:132
a xml text node
Definition: xml.hxx:122
void GBS_chrcat(GBS_strstruct *strstr, char ch)
Definition: arb_strbuf.cxx:119
fputs(TRACE_PREFIX, stderr)
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:264
bool is_leaf() const
Definition: TreeNode.h:171
Definition: output.h:122
bool whole_label_quoted(const char *label, size_t length)
Definition: TreeWrite.cxx:30
char * name
Definition: TreeNode.h:134
int GB_read_byte(GBDATA *gbd)
Definition: arbdb.cxx:728
static void export_tree_rek(TreeNode *tree, FILE *out, bool export_branchlens, bool dquot)
Definition: TreeWrite.cxx:404
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:903
char * GBS_strclose(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:69
#define NULp
Definition: cxxforward.h:97
bool isQuoteChar(char c)
Definition: TreeWrite.cxx:29
TREE_make_node_text_init init
Definition: TreeWrite.h:25
TreeNode * keelTarget()
Definition: TreeNode.h:407
GBDATA * GBT_find_tree(GBDATA *gb_main, const char *tree_name)
Definition: adtree.cxx:947
GB_transaction ta(gb_var)
void destroy(TreeNode *that)
Definition: TreeNode.h:559
GBDATA * gb_node
Definition: TreeNode.h:133
GBDATA * gb_main
Definition: adname.cxx:33
const char * get_remark() const
Definition: TreeNode.h:266
size_t length
static const char * export_tree_node_print_xml(GBDATA *gb_main, TreeNode *tree, double my_length, const char *tree_name, const TREE_node_text_gen *node_gen, bool skip_folded, const string &parent_id, int &parent_son_counter)
Definition: TreeWrite.cxx:173
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
Definition: output.h:28
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
const char * label
#define tree_assert(cond)
Definition: TreeWrite.cxx:19