ARB
seq_export.cxx
Go to the documentation of this file.
1 // ============================================================= //
2 // //
3 // File : seq_export.cxx //
4 // Purpose : Export of sequence data using export format definitions (.eft) //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // ============================================================= //
10 
11 #include "seqio.hxx"
12 
13 #include <AP_filter.hxx>
14 #include <xferset.h>
15 
16 #include <arbdbt.h>
17 #include <gb_aci.h>
18 
19 #include <arb_strarray.h>
20 #include <arb_file.h>
21 #include <arb_diff.h>
22 #include <arb_progress.h>
23 #include <arb_global_defs.h>
24 
25 #include <xml.hxx>
26 
27 #include <unistd.h>
28 
29 #define sio_assert(cond) arb_assert(cond)
30 
31 using std::string;
32 using namespace SEQIO;
33 using namespace FieldTransfer;
34 
35 // ---------------------------------
36 // internal export commands
37 
// Export modes selectable by an export format file.
//
// The leading ("real format") entries have to stay index-aligned with
// internal_export_commands[], because check_internal() converts the array
// index of a matched command name directly into an EXPORT_CMD value.
enum EXPORT_CMD {
    // real formats
    EXPORT_XML,        // "xml_write" (index 0 of internal_export_commands)

    EXPORT_INVALID,    // reported by check_internal() for unknown commands
    EXPORT_USING_FORM, // default mode (has to be last entry in enum)
};
45 
46 static const char *internal_export_commands[] = {
47  "xml_write",
48  NULp
49 };
50 
51 static EXPORT_CMD check_internal(const char *command) {
53  for (int i = 0; internal_export_commands[i]; ++i) {
54  if (strcmp(command, internal_export_commands[i]) == 0) {
55  cmd = static_cast<EXPORT_CMD>(i);
56  }
57  }
58  return cmd;
59 }
60 
61 // ----------------------
62 // export_format
63 
64 struct export_format : virtual Noncopyable {
65  char *system;
66  char *pre_format;
67  char *suffix;
68  char *description; // (multiline) description of filter
69  char *form; // transformed export expression (part behind 'BEGIN')
70 
72 
74  : system(NULp),
75  pre_format(NULp),
76  suffix(NULp),
77  description(NULp),
78  form(NULp),
79  export_mode(EXPORT_XML)
80  {}
82  free(system);
83  free(pre_format);
84  free(suffix);
85  free(description);
86  free(form);
87  }
88 };
89 
90 static GB_ERROR read_export_format(export_format *efo, const char *file, bool load_complete_form) {
92 
93  if (!file || !file[0]) {
94  error = "No export format selected";
95  }
96  else {
97  char *fullfile = NULp;
98  if (GB_is_regularfile(file)) { // prefer files that are completely specified (full/rel path)
99  fullfile = strdup(GB_canonical_path(file));
100  }
101  else {
102  fullfile = nulldup(GB_path_in_ARBHOME(file)); // fallback to ARBHOME-relative specification
103  }
104 
105  FILE *in = fopen(fullfile, "r");
106 
107  if (!in) error = GB_IO_error("reading export form", fullfile);
108  else {
109  efo->export_mode = EXPORT_USING_FORM; // default mode
110  {
111  bool seen_BEGIN = false;
112  char *s1, *s2;
113  size_t linenumber = 0;
114 
115  while (!error && !seen_BEGIN && read_string_pair(in, s1, s2, linenumber)) {
116  if (!strcmp(s1, "SYSTEM")) { reassign(efo->system, s2); }
117  else if (!strcmp(s1, "PRE_FORMAT")) { reassign(efo->pre_format, s2); }
118  else if (!strcmp(s1, "SUFFIX")) { reassign(efo->suffix, s2); }
119  else if (!strcmp(s1, "DESCRIPTION")) { appendTo(efo->description, '\n', s2); }
120  else if (!strcmp(s1, "INTERNAL")) {
121  efo->export_mode = check_internal(s2);
122  if (efo->export_mode == EXPORT_INVALID) {
123  error = GBS_global_string("Unknown INTERNAL command '%s'", s2);
124  }
125  }
126  else if (!strcmp(s1, "BEGIN")) {
127  if (efo->export_mode != EXPORT_USING_FORM) {
128  error = "'BEGIN' not allowed when 'INTERNAL' is used";
129  }
130  else {
131  seen_BEGIN = true;
132  }
133  }
134  else {
135  error = GBS_global_string("Unknown command '%s'", s1);
136  }
137 
138  // add error location
139  if (error) error = GBS_global_string("%s in line #%zu", error, linenumber);
140 
141  free(s2);
142  free(s1);
143  }
144  }
145 
146  if (!error && load_complete_form && efo->export_mode == EXPORT_USING_FORM) {
147  // now 'in' points to line behind 'BEGIN'
148  char *form = GB_read_fp(in); // read rest of file
149 
150  // Join lines that end with \ with next line.
151  // Replace ' = ' and ':' by '\=' and '\:'
152  efo->form = GBS_string_eval(form, "\\\\\n=:\\==\\\\\\=:*=\\*\\=*1:\\:=\\\\\\:");
153  if (!efo->form) error = GB_failedTo_error("evaluate part below 'BEGIN'", NULp, GB_await_error());
154  free(form);
155  }
156 
157  // some checks for incompatible commands
158  if (!error) {
159  if (efo->system && !efo->pre_format) error = "Missing 'PRE_FORMAT' (needed by 'SYSTEM')";
160  else if (efo->pre_format && !efo->system) error = "Missing 'SYSTEM' (needed by 'PRE_FORMAT')";
161  else if (efo->export_mode != EXPORT_USING_FORM) {
162  if (efo->system) error = "'SYSTEM' is not allowed together with 'INTERNAL'";
163  if (efo->pre_format) error = "'PRE_FORMAT' is not allowed together with 'INTERNAL'";
164  }
165  }
166 
167  error = GB_failedTo_error("read export format", fullfile, error);
168  fclose(in);
169  }
170  free(fullfile);
171  }
172 
173  return error;
174 }
175 
176 // ----------------------------------------
177 // export sequence helper class
178 
179 class SpeciesSelector : virtual Noncopyable {
180  ExportWhich which;
181  const char *one_species;
182 
183 public:
184  SpeciesSelector(ExportWhich which_, const char *one_species_) :
185  which(which_),
186  one_species(one_species_)
187  {}
189  GBDATA *gb_species = NULp;
190  switch (which) {
191  case EBF_ALL: gb_species = GBT_first_species(gb_main); break;
192  case EBF_MARKED: gb_species = GBT_first_marked_species(gb_main); break;
193  case EBF_ONE: gb_species = GBT_find_species(gb_main, one_species); break;
194  }
195  return gb_species;
196  }
197  GBDATA *select_next(GBDATA *gb_previous) const {
198  GBDATA *gb_species = NULp;
199  switch (which) {
200  case EBF_ALL: gb_species = GBT_next_species(gb_previous); break;
201  case EBF_MARKED: gb_species = GBT_next_marked_species(gb_previous); break;
202  case EBF_ONE: break;
203  }
204  return gb_species;
205  }
206 };
207 
208 class export_sequence_data : virtual Noncopyable { // @@@ simplify using FilteredExport?
209  GBDATA *last_species_read;
210  char *seq;
211  size_t len;
212  char *error;
213 
214  GBDATA *gb_main;
215  char *ali;
216 
217  SpeciesSelector whichSpecies;
218 
219  size_t species_count;
220  AP_filter *filter;
221  bool cut_stop_codon;
222  int compress; // 0 = no;1 = vertical gaps; 2 = all gaps;
223 
224  long max_ali_len; // length of alignment
225  size_t *export_column; // list of exported seq data positions
226  size_t columns; // how many columns get exported
227 
228  GBDATA *single_species; // if set to species -> first/next only return this species (used to export to multiple files)
229 
230 public:
231 
232  export_sequence_data(GBDATA *Gb_Main, ExportWhich which, const char *one_species, AP_filter* Filter, bool CutStopCodon, int Compress) :
233  last_species_read(NULp),
234  seq(NULp),
235  len(0),
236  error(NULp),
237  gb_main(Gb_Main),
238  whichSpecies(which, one_species),
239  species_count(size_t(-1)),
240  filter(Filter),
241  cut_stop_codon(CutStopCodon),
242  compress(Compress),
243  export_column(NULp),
244  columns(0),
245  single_species(NULp)
246  {
247  ali = GBT_get_default_alignment(gb_main);
248  max_ali_len = GBT_get_alignment_len(gb_main, ali);
249 
250  if (cut_stop_codon) {
251  GB_alignment_type ali_type = GBT_get_alignment_type(gb_main, ali);
252  if (ali_type != GB_AT_AA) {
253  GB_warning("Cutting stop codon makes no sense - ignored");
254  cut_stop_codon = false;
255  }
256  }
257  sio_assert(filter);
258 
259  if (max_ali_len>=0 && filter->get_length() < size_t(max_ali_len)) {
260  GB_warningf("Warning: Your filter is shorter than the alignment (%zu<%li)",
261  filter->get_length(), max_ali_len);
262  max_ali_len = filter->get_length();
263  }
264  }
265 
267  delete [] export_column;
268  delete [] seq;
269  free(error);
270  free(ali);
271  }
272 
273  const char *getAlignment() const { return ali; }
274  long getAliLen() const { return max_ali_len; }
275  GBDATA *get_gb_main() const { sio_assert(gb_main); return gb_main; }
276 
277  void set_single_mode(GBDATA *gb_species) { single_species = gb_species; }
278  bool in_single_mode() const { return single_species; }
279 
280  GBDATA *first_species() const { return single_species ? single_species : whichSpecies.select_first(gb_main); }
281  GBDATA *next_species(GBDATA *gb_prev) const { return single_species ? NULp : whichSpecies.select_next(gb_prev); }
282 
283  const unsigned char *get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& error) const;
284  static bool isGap(char c) { return GAP::is_std_gap(c); }
285 
286  size_t count_species() {
287  sio_assert(!in_single_mode());
288  if (species_count == size_t(-1)) {
289  species_count = 0;
290  for (GBDATA *gb_species = whichSpecies.select_first(gb_main);
291  gb_species;
292  gb_species = whichSpecies.select_next(gb_species))
293  {
294  species_count++;
295  }
296  }
297  return species_count;
298  }
299 
300  GB_ERROR detectVerticalGaps();
301  const char *get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& error);
302 };
303 
304 const unsigned char *export_sequence_data::get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& err) const {
305  const char *data = NULp;
306  GBDATA *gb_seq = GBT_find_sequence(gb_species, ali);
307 
308  if (!gb_seq) {
309  err = GBS_global_string_copy("No data in alignment '%s' of species '%s'", ali, GBT_get_name_or_description(gb_species));
310  slen = 0;
311  }
312  else {
313  data = GB_read_char_pntr(gb_seq);
314  slen = GB_read_count(gb_seq);
315  err = NULp;
316  }
317  return (const unsigned char *)data;
318 }
319 
320 
322  GB_ERROR err = NULp;
323 
324  sio_assert(!in_single_mode());
325 
326  if (compress == 1) { // compress vertical gaps!
327  // @@@ detection of vertical gaps should better be done either by AP_filter directly or by FilteredExport
328 
329  size_t gap_columns = filter->get_filtered_length();
330  size_t *gap_column = new size_t[gap_columns+1];
331 
332  const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos();
333  memcpy(gap_column, filterpos_2_seqpos, gap_columns*sizeof(*gap_column));
334  gap_column[gap_columns] = max_ali_len;
335 
336  arb_progress progress("Calculating vertical gaps", count_species());
337 
338  for (GBDATA *gb_species = first_species();
339  gb_species && !err;
340  gb_species = next_species(gb_species))
341  {
342  size_t slen;
343  const unsigned char *sdata = get_seq_data(gb_species, slen, err);
344 
345  if (!err) {
346  size_t j = 0;
347  size_t i;
348  for (i = 0; i<gap_columns; ++i) {
349  if (isGap(sdata[gap_column[i]])) {
350  gap_column[j++] = gap_column[i]; // keep gap column
351  }
352  // otherwise it's overwritten
353  }
354 
355  sio_assert(i >= j);
356  size_t skipped_columns = i-j;
357  sio_assert(gap_columns >= skipped_columns);
358  gap_columns -= skipped_columns;
359  }
360  progress.inc_and_check_user_abort(err);
361  }
362 
363  if (!err) {
364  columns = filter->get_filtered_length() - gap_columns;
365  export_column = new size_t[columns];
366 
367  size_t gpos = 0; // index into array of vertical gaps
368  size_t epos = 0; // index into array of exported columns
369  size_t flen = filter->get_filtered_length();
370  size_t a;
371  for (a = 0; a<flen && gpos<gap_columns; ++a) {
372  size_t fpos = filterpos_2_seqpos[a];
373  if (fpos == gap_column[gpos]) { // only gaps here -> skip column
374  gpos++;
375  }
376  else { // not only gaps -> use column
377  sio_assert(fpos<gap_column[gpos]);
378  sio_assert(epos < columns); // got more columns than expected
379  export_column[epos++] = fpos;
380  }
381  }
382  for (; a<flen; ++a) { // LOOP_VECTORIZED
383  export_column[epos++] = filterpos_2_seqpos[a];
384  }
385 
386  sio_assert(epos == columns);
387  }
388 
389  delete [] gap_column;
390  }
391  else { // compress all or none (simply use filter)
392  const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos();
393 
394  columns = filter->get_filtered_length();
395  export_column = new size_t[columns];
396 
397  memcpy(export_column, filterpos_2_seqpos, columns*sizeof(*filterpos_2_seqpos));
398  }
399 
400  seq = new char[columns+1];
401 
402  return err;
403 }
404 
405 const char *export_sequence_data::get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& err) {
406  if (gb_species != last_species_read) {
407  freenull(error);
408 
409  // read + filter a new species
410  GB_ERROR curr_error;
411  const unsigned char *data = get_seq_data(gb_species, len, curr_error);
412 
413  if (curr_error) {
414  error = strdup(curr_error);
415  }
416  else {
417  size_t i;
418  const uchar *simplify = filter->get_simplify_table();
419 
420  if (cut_stop_codon) {
421  const unsigned char *stop_codon = (const unsigned char *)memchr(data, '*', len);
422  if (stop_codon) {
423  len = stop_codon-data;
424  }
425  }
426 
427  if (compress == 2) { // compress all gaps
428  size_t j = 0;
429  for (i = 0; i<columns; ++i) {
430  size_t seq_pos = export_column[i];
431  if (seq_pos<len) {
432  unsigned char c = data[seq_pos];
433  if (!isGap(c)) {
434  seq[j++] = simplify[c];
435  }
436  }
437  }
438  seq[j] = 0;
439  len = j;
440  }
441  else { // compress vertical or compress none (simply use filter in both cases)
442  for (i = 0; i<columns; ++i) {
443  size_t seq_pos = export_column[i];
444  if (seq_pos<len) {
445  seq[i] = simplify[data[seq_pos]];
446  }
447  else {
448  seq[i] = simplify['.'];
449  }
450  }
451  seq[i] = 0;
452  len = columns;
453  }
454  }
455  }
456 
457  err = error;
458  if (error) {
459  seq_len = 0;
460  return NULp;
461  }
462 
463  seq_len = len;
464  return seq;
465 }
466 
467 // ----------------------------------------
468 // exported_sequence is hooked into ACI temporary (provides result of command 'export_sequence')
469 // which is the sequence filtered and compressed according to settings in the export window
470 
472 
473 static const char *exported_sequence(GBDATA *gb_species, size_t *seq_len, GB_ERROR *error) {
474  sio_assert(esd);
475  return esd->get_export_sequence(gb_species, *seq_len, *error);
476 }
477 
478 static GB_ERROR XML_recursive(GBDATA *gbd, int depth) {
479  GB_ERROR error = NULp;
480  const char *key_name = GB_read_key_pntr(gbd);
481  XML_Tag *tag = NULp;
482  bool descend = true;
483 
484  if (depth == 1 && strncmp(key_name, "ali_", 4) == 0) { // hack needed if seq-quality information exists
485  sio_assert(esd);
486  descend = false; // do not descend into alignments
487  if (strcmp(esd->getAlignment(), key_name) == 0) { // the wanted alignment
488 
489  tag = new XML_Tag("ALIGNMENT");
490  tag->add_attribute("name", key_name+4);
491 
492  GBDATA *gb_species = GB_get_father(gbd);
493  size_t len;
494  const char *seq = exported_sequence(gb_species, &len, &error);
495 
496  if (seq) {
497  XML_Tag dtag("data");
498  { XML_Text seqText(seq); }
499  }
500  }
501  }
502  else {
503  tag = new XML_Tag(key_name);
504 
505  if (GB_is_container(gbd)) {
506  const char *name = GBT_read_char_pntr(gbd, "name");
507  if (name) tag->add_attribute("name", name);
508  }
509  }
510 
511  if (descend) {
512  if (GB_read_type(gbd) == GB_DB) {
513  for (GBDATA *gb_child = GB_child(gbd); gb_child && !error; gb_child = GB_nextChild(gb_child)) {
514  const char *sub_key_name = GB_read_key_pntr(gb_child);
515 
516  if (strcmp(sub_key_name, "name") != 0) { // do not recurse for "name" (is handled above)
517  error = XML_recursive(gb_child, depth+1);
518  }
519  }
520  }
521  else {
522  char *content = GB_read_as_string(gbd);
523  if (content) {
524  XML_Text text(content);
525  free(content);
526  }
527  else {
528  tag->add_attribute("error", "unsavable");
529  }
530  }
531  }
532 
533  delete tag;
534  return error;
535 }
536 
537 static GB_ERROR export_species_using_form(FILE *out, const char *form, const GBL_call_env& callEnv) { // @@@ pass preparsed command (form)
538  GB_ERROR error = NULp;
539  char *pars = GBS_string_eval_in_env(" ", form, callEnv);
540  if (!pars) error = GB_await_error();
541  else {
542  char *p;
543  char *o = pars;
544  while ((p = GBS_find_string(o, "$$DELETE_LINE$$", 0))) {
545  char *l, *r;
546  for (l = p; l>o; l--) if (*l=='\n') break;
547  r = strchr(p, '\n'); if (!r) r = p + strlen(p);
548  fwrite(o, 1, l-o, out);
549  o = r;
550  }
551  fputs(o, out);
552  free(pars);
553  }
554  return error;
555 }
556 
557 static GB_ERROR export_write_species(GBDATA *gb_species, FILE *out, const GBL_env& env, const export_format& efo) {
558  GB_ERROR error = NULp;
559  switch (efo.export_mode) {
560  case EXPORT_USING_FORM: {
561  GBL_call_env callEnv(gb_species, env);
562  error = export_species_using_form(out, efo.form, callEnv);
563  break;
564  }
565 
566  case EXPORT_XML:
567  error = XML_recursive(gb_species, 0);
568  break;
569 
570  case EXPORT_INVALID:
571  sio_assert(0);
572  break;
573  }
574  return error;
575 }
576 
// Export all species delivered by 'esd' into ONE file using the export format 'formname'.
// Formats defining 'PRE_FORMAT' are exported by a recursive call (into a temporary file),
// followed by a conversion using the 'SYSTEM' command of the format.
// If an error occurs, the output file is removed and '*resulting_outname' is reset to NULp.
577 static GB_ERROR export_format_single(const char *db_name, const char *formname, const char *outname, char **resulting_outname, RuleSetPtr ruleset) {
578  // Exports sequences specified by 'esd' (module global variable)
579  // to format specified by 'formname'.
580  //
581  // if 'outname' == NULp -> export species to temporary file, otherwise to 'outname'.
582  // Full path of generated file is returned in 'resulting_outname'
583 
// NOTE(review): 'export_depth' is only incremented/decremented in this function and never read here.
584  static int export_depth = 0;
585  export_depth++;
586 
587  *resulting_outname = NULp;
588 
589  export_format efo;
590  GB_ERROR error = read_export_format(&efo, formname, true);
591 
592  if (!error) {
593  if (!outname) { // if no 'outname' is given -> export to temporary file
594  char *unique_outname = GB_unique_filename("exported", efo.suffix);
595  *resulting_outname = GB_create_tempfile(unique_outname);
596  free(unique_outname);
597 
598  if (!*resulting_outname) error = GB_await_error();
599  }
600  else *resulting_outname = strdup(outname);
601  }
602 
603  sio_assert(error || *resulting_outname);
604 
605  if (!error) {
606  if (efo.pre_format) {
607  // Export data using format 'pre_format'.
608  // Afterwards convert to wanted format using 'system'.
609 
610  sio_assert(efo.system);
611 
612  char *intermediate_export;
613  error = export_format_single(db_name, efo.pre_format, NULp, &intermediate_export, ruleset);
614  if (!error) {
615  sio_assert(GB_is_privatefile(intermediate_export, false));
616 
617  GB_informationf("Converting to %s", efo.suffix);
618 
// '$<' and '$>' in the SYSTEM command get replaced by the names of the intermediate and the final file
619  char *srt = GBS_global_string_copy("$<=%s:$>=%s", intermediate_export, *resulting_outname);
620  char *sys = GBS_string_eval(efo.system, srt);
621 
// NOTE(review): the message shows the unexpanded command (efo.system), while 'sys' gets executed;
// 'sys' is not checked for NULp before it is passed to GBK_system() -- confirm whether GBS_string_eval may fail here.
622  GB_informationf("exec '%s'", efo.system);
623  error = GBK_system(sys);
624 
625  GB_unlink_or_warn(intermediate_export, &error);
626 
627  free(sys);
628  free(srt);
629  }
630  free(intermediate_export);
631  }
632  else {
633  FILE *out = fopen(*resulting_outname, "wt");
634  if (!out) error = GB_IO_error("writing", *resulting_outname);
635  else {
636  XML_Document *xml = NULp;
637 
// count species to export (used to size the progress indicator)
638  long allCount = 0;
639  for (GBDATA *gb_species = esd->first_species();
640  gb_species && !error;
641  gb_species = esd->next_species(gb_species))
642  {
643  allCount++;
644  }
645 
646  arb_progress progress(allCount);
647  progress.auto_subtitles("Saving species");
648 
649  if (efo.export_mode == EXPORT_XML) {
650  xml = new XML_Document("ARB_SEQ_EXPORT", "arb_seq_export.dtd", out);
651  {
652  xml->add_attribute("database", db_name);
653  }
654  xml->add_attribute("export_date", ARB_date_string());
655  {
656  XML_Comment rem("There is a basic version of ARB_seq_export.dtd in $ARBHOME/lib/dtd\n"
657  "but you might need to expand it by yourself,\n"
658  "because the ARB-database may contain any kind of fields.");
659  }
660  }
661 
662  GBL_env env(esd->get_gb_main(), NULp);
663 
664  for (GBDATA *gb_species = esd->first_species();
665  gb_species && !error;
666  gb_species = esd->next_species(gb_species))
667  {
668  if (ruleset.isSet()) {
669  GB_topSecurityLevel unsecured(env.get_gb_main()); // needed to clone species (overwrites name .. in temporary clone)
// NOTE(review): 'clone' is used below, but its declaration (original line 670) is not part of
// this listing -- restore it from the upstream file before compiling.
671  if (clone.has_error()) {
672  error = clone.get_error();
673  }
674  else {
675  GB_previousSecurityLevel user(unsecured); // run export itself with normal security
676  error = export_write_species(clone.get_clone(), out, env, efo);
677  }
678  }
679  else {
680  error = export_write_species(gb_species, out, env, efo);
681  }
682  progress.inc_and_check_user_abort(error);
683  }
684 
// NOTE(review): the result of fclose() is not checked, i.e. a failing final write is not reported.
685  delete xml;
686  fclose(out);
687  }
688  }
689  }
690 
// on error: remove the (possibly incomplete) output file and report no filename
691  if (error) {
692  if (*resulting_outname) {
693  GB_unlink_or_warn(*resulting_outname, NULp);
694  freenull(*resulting_outname);
695  }
696  }
697 
698  export_depth--;
699 
700  return error;
702 
703 static GB_ERROR export_format_multiple(const char* dbname, const char *formname, const char *outname, bool multiple, char **resulting_outname, RuleSetPtr ruleset) {
704  GB_ERROR error = NULp;
705 
706  if (multiple) {
707  char *path, *name, *suffix;
708  GB_split_full_path(outname, &path, NULp, &name, &suffix);
709  *resulting_outname = NULp;
710 
711  arb_progress progress("Exporting data", esd->count_species());
712 
713  for (GBDATA *gb_species = esd->first_species();
714  gb_species && !error;
715  gb_species = esd->next_species(gb_species))
716  {
717  const char *species_name = GBT_read_char_pntr(gb_species, "name");
718  if (!species_name) error = "Can't export unnamed species";
719  else {
720  const char *fname = GB_append_suffix(GBS_global_string("%s_%s", name, species_name), suffix);
721  progress.subtitle(fname);
722 
723  char *oname = strdup(GB_concat_path(path, fname));
724  char *res_oname;
725 
726  esd->set_single_mode(gb_species); // means: only export 'gb_species'
727  error = export_format_single(dbname, formname, oname, &res_oname, ruleset);
728  esd->set_single_mode(NULp);
729 
730  if (!*resulting_outname || // not set yet
731  (res_oname && strcmp(*resulting_outname, res_oname)>0)) // or smaller than set one
732  {
733  reassign(*resulting_outname, res_oname);
734  }
735 
736  free(res_oname);
737  free(oname);
738  }
739 
740  progress.inc_and_check_user_abort(error);
741  }
742 
743  free(suffix);
744  free(name);
745  free(path);
746  }
747  else {
748  arb_progress progress("Exporting data");
749  error = export_format_single(dbname, formname, outname, resulting_outname, ruleset);
750  }
751 
752  return error;
753 }
754 
755 namespace SEQIO {
756 
757  GB_ERROR export_by_format(GBDATA *gb_main, ExportWhich which, const char *one_species,
758  AP_filter *filter, int cut_stop_codon, int compress,
759  const char *dbname, const char *formname, const char *field_transfer_set,
760  const char *outname, int multiple, char **real_outname)
761  {
763 
764  if (field_transfer_set && !field_transfer_set[0]) { // empty 'field_transfer_set' given
765  field_transfer_set = NULp; // -> handle like NULp
766  }
767 
768  GB_ERROR error = filter->is_invalid();
769 
770  RuleSetPtr ruleset;
771  if (!error) {
772  if (field_transfer_set) { // if specified load ruleset:
773  ErrorOrRuleSetPtr loaded = RuleSet::loadFrom(field_transfer_set);
774 
775  if (loaded.hasError()) {
776  ARB_ERROR lerror = loaded.getError();
777  error = lerror.deliver();
778  }
779  else {
780  ruleset = loaded.getValue();
781  }
782  }
783  }
784 
785  if (!error) {
786  esd = new export_sequence_data(gb_main, which, one_species, filter, cut_stop_codon, compress);
787  sio_assert(esd->getAliLen()>0);
788 
790 
791  error = esd->detectVerticalGaps();
792  if (!error) {
793  error = export_format_multiple(dbname, formname, outname, multiple, real_outname, ruleset);
794  if (error) error = GBS_static_string(error); // error is member of export_sequence_data -> copy to static buffer
795  }
796 
798  }
799  delete esd;
800  esd = NULp;
801 
803  return error;
804  }
805 
807  export_format efs;
808  GB_ERROR error = read_export_format(&efs, eft_formname, false);
809 
810  if (!error) {
811  if (efs.suffix) {
812  info.suffix = efs.suffix;
813  efs.suffix = NULp;
814  }
815  if (efs.description) {
816  info.description = efs.description;
817  efs.description = NULp;
818  }
819  }
820 
821  return error;
822  }
823 
824  char *get_exportFormat_evalForm(const char *eft_formname, GB_ERROR& error) {
825  // load copy of form that gets evaluated during export.
826  export_format efs;
827  error = read_export_format(&efs, eft_formname, true);
828  if (!error && efs.form) {
829  if (efs.pre_format) {
830  sio_assert(strcmp(efs.form, "*=") == 0); // caused by eval in read_export_format?
831  return get_exportFormat_evalForm(efs.pre_format, error);
832  }
833 
834  sio_assert(efs.pre_format == NULp);
835  return ARB_strdup(efs.form);
836  }
837  // failed to load form
838 
839  sio_assert(efs.form == NULp);
840  sio_assert(efs.pre_format == NULp);
841  if (!error) {
842  if (efs.export_mode != EXPORT_USING_FORM) {
843  if (efs.export_mode == EXPORT_XML) {
844  error = "exports all fields";
845  }
846  else {
847  error = "unsupported filter type";
848  }
849  }
850  else {
851  error = "no form loaded";
852  }
853  }
854 
855  sio_assert(error);
856  if (error) {
857  char *nameOnly = NULp;
858  GB_split_full_path(eft_formname, NULp, &nameOnly, NULp, NULp);
859 
860  const char *shownName = nameOnly ? nameOnly : eft_formname;
861  error = GBS_global_string("%s (%s)", error, shownName);
862 
863  free(nameOnly);
864  }
865  return NULp;
866  }
867 
868 };
869 
870 // --------------------------------------------------------------------------------
871 
872 #ifdef UNIT_TESTS
873 #include <test_unit.h>
874 
875 // uncomment to auto-update exported files
876 // (needed once after changing database or export formats)
877 // #define TEST_AUTO_UPDATE
878 #define TEST_AUTO_UPDATE_ONLY_MISSING // do auto-update only if file is missing
879 
// Unit test: loads every export format (*.eft) found in the 'export' directory of ARBLIB,
// exports the single marked species 'MetMazei' with each of them and
// compares the result with the expected file 'impexp/<format>.exported'.
880 void TEST_sequence_export() {
881  GB_shell shell;
882  arb_suppress_progress silence;
883 
884  GBDATA *gb_main = GB_open("TEST_loadsave.arb", "r");
885  char *export_dir = nulldup(GB_path_in_ARBLIB("export"));
886  StrArray eft;
887  GBS_read_dir(eft, export_dir, "*.eft");
888 
889  AP_filter *filter = NULp;
890  {
891  GB_transaction ta(gb_main);
892 
893  char *ali = GBT_get_default_alignment(gb_main);
894  size_t alilen = GBT_get_alignment_len(gb_main, ali);
895  filter = new AP_filter(alilen);
896 
// unmark everything, then mark only the species used for the test
897  GBT_mark_all(gb_main, 0);
898  GBDATA *gb_species = GBT_find_species(gb_main, "MetMazei");
899  TEST_REJECT_NULL(gb_species);
900 
901  GB_write_flag(gb_species, 1); // mark
902  free(ali);
903  }
904  for (int e = 0; eft[e]; ++e) {
// complete == 0: only the header of the format is loaded; complete == 1: form is loaded and the export is tested
905  for (int complete = 0; complete <= 1; ++complete) {
906  const char *name = strrchr(eft[e], '/');
907  TEST_REJECT_NULL(name);
908  name++;
909 
910  TEST_ANNOTATE(name);
911 
912  {
913  export_format efo;
914  TEST_EXPECT_NO_ERROR(read_export_format(&efo, eft[e], complete));
915  if (strcmp(name, "fasta_wacc.eft") == 0) { // test description of one filter
// NOTE(review): the line opening this check (original line 916) is not part of this listing;
// the two string lines below are the tail of that statement.
917  "Exports sequences to fasta-format.\n"
918  "Header exported as: >ID SEQLENGTH bp SEQTYPE ACC");
919  }
920  }
921 
922  if (complete) {
923  const char *outname = "impexp/exported";
924  char *used_outname = NULp;
925 
926  {
927  GB_transaction ta(gb_main);
// NOTE(review): the line opening the export call (original line 928) is not part of this listing;
// the three lines below are the remaining arguments of that call.
929  filter, 0, 0,
930  "DBname", eft[e], NULp, // @@@ currently only tests export w/o FTS (pass FTS for some formats? or separately)
931  outname, 0, &used_outname));
932  }
933 
934  char *expected = GBS_global_string_copy("impexp/%s.exported", name);
935 
// NOTE(review): if TEST_AUTO_UPDATE and TEST_AUTO_UPDATE_ONLY_MISSING are both defined, the block
// calling TEST_COPY_FILE is excluded by the inner '#else' and the dangling 'else' binds to the
// unlink statement behind the outer '#endif' -- looks unintended, confirm.
936 #if defined(TEST_AUTO_UPDATE)
937 #if defined(TEST_AUTO_UPDATE_ONLY_MISSING)
938  if (GB_is_regularfile(expected)) {
939  TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0);
940  }
941  else
942 #else
943  {
944  TEST_COPY_FILE(outname, expected);
945  }
946 #endif
947 #else
948  TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0);
949  // see ../../UNIT_TESTER/run/impexp
950 #endif // TEST_AUTO_UPDATE
951  TEST_EXPECT_ZERO_OR_SHOW_ERRNO(unlink(outname));
952 
953  free(expected);
954  free(used_outname);
955  }
956  }
957  }
958 
959  delete filter;
960  free(export_dir);
961  GB_close(gb_main);
962 }
963 
964 #endif // UNIT_TESTS
ExportWhich
Definition: seqio.hxx:28
GB_ERROR GBK_system(const char *system_command)
Definition: arb_msg.cxx:519
static bool isGap(char c)
Definition: seq_export.cxx:284
const char * GB_ERROR
Definition: arb_core.h:25
GBDATA * select_next(GBDATA *gb_previous) const
Definition: seq_export.cxx:197
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
char * oname
Definition: readseq.c:463
char * GB_read_fp(FILE *in)
Definition: adsocket.cxx:271
static export_sequence_data * esd
Definition: seq_export.cxx:471
void GB_warning(const char *message)
Definition: arb_msg.cxx:484
GB_ERROR detectVerticalGaps()
Definition: seq_export.cxx:321
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
GBDATA * GB_child(GBDATA *father)
Definition: adquery.cxx:322
return string(buffer, length)
char * GBS_string_eval_in_env(const char *insource, const char *icommand, const GBL_call_env &callEnv)
Definition: admatch.cxx:493
static gb_export_sequence_cb get_export_sequence
Definition: adlang1.cxx:33
void GB_unlink_or_warn(const char *path, GB_ERROR *error)
Definition: arb_file.cxx:206
NOT4PERL void GB_set_export_sequence_hook(gb_export_sequence_cb escb)
Definition: adlang1.cxx:35
bool isGap(char c)
bool read_string_pair(FILE *in, char *&s1, char *&s2, size_t &lineNr)
Definition: seqio.cxx:37
GB_ERROR GB_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:293
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
GBDATA * get_gb_main() const
Definition: seq_export.cxx:275
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1054
const char * ARB_date_string()
Definition: arb_string.cxx:35
bool hasError() const
Definition: ErrorOrType.h:48
TYPE getValue() const
Definition: ErrorOrType.h:56
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:706
bool GB_have_error()
Definition: arb_msg.cxx:349
char * GBS_string_eval(const char *insource, const char *icommand)
Definition: admatch.cxx:699
GBDATA * select_first(GBDATA *gb_main) const
Definition: seq_export.cxx:188
void set_single_mode(GBDATA *gb_species)
Definition: seq_export.cxx:277
void auto_subtitles(const char *prefix)
Definition: arb_progress.h:286
SpeciesSelector(ExportWhich which_, const char *one_species_)
Definition: seq_export.cxx:184
SmartCharPtr description
Definition: seqio.hxx:54
static EXPORT_CMD check_internal(const char *command)
Definition: seq_export.cxx:51
GBDATA * GB_get_father(GBDATA *gbd)
Definition: arbdb.cxx:1720
GB_CSTR GB_canonical_path(const char *anypath)
Definition: adsocket.cxx:968
FILE * seq
Definition: rns.c:46
EXPORT_CMD
Definition: seq_export.cxx:38
GB_CSTR GBS_find_string(GB_CSTR cont, GB_CSTR substr, int match_mode)
Definition: admatch.cxx:103
static GB_ERROR export_format_single(const char *db_name, const char *formname, const char *outname, char **resulting_outname, RuleSetPtr ruleset)
Definition: seq_export.cxx:577
static const char * internal_export_commands[]
Definition: seq_export.cxx:46
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
EXPORT_CMD export_mode
Definition: seq_export.cxx:71
char * GB_create_tempfile(const char *name)
Definition: adsocket.cxx:1188
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:752
Definition: arbdb.h:78
GB_TYPES GB_read_type(GBDATA *gbd)
Definition: arbdb.cxx:1641
static GB_ERROR export_species_using_form(FILE *out, const char *form, const GBL_call_env &callEnv)
Definition: seq_export.cxx:537
GB_ERROR deliver() const
Definition: arb_error.h:114
void GB_warningf(const char *templat,...)
Definition: arb_msg.cxx:490
GB_CSTR GB_read_key_pntr(GBDATA *gbd)
Definition: arbdb.cxx:1654
bool isSet() const
test if SmartPtr is not NULp
Definition: smartptr.h:245
char * pre_format
Definition: seq_export.cxx:66
Generic smart pointer.
Definition: smartptr.h:149
static GB_ERROR export_format_multiple(const char *dbname, const char *formname, const char *outname, bool multiple, char **resulting_outname, RuleSetPtr ruleset)
Definition: seq_export.cxx:703
static GB_ERROR XML_recursive(GBDATA *gbd, int depth)
Definition: seq_export.cxx:478
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1310
static void error(const char *msg)
Definition: mkptypes.cxx:96
GB_CSTR GB_path_in_ARBHOME(const char *relative_path)
Definition: adsocket.cxx:1121
size_t get_length() const
Definition: AP_filter.hxx:83
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
GB_ERROR get_exportFormat_information(const char *eft_formname, ExportFormatInfo &info)
Definition: seq_export.cxx:806
const char * get_export_sequence(GBDATA *gb_species, size_t &seq_len, GB_ERROR &error)
Definition: seq_export.cxx:405
GB_ERROR export_by_format(GBDATA *gb_main, ExportWhich which, const char *one_species, AP_filter *filter, int cut_stop_codon, int compress, const char *dbname, const char *formname, const char *field_transfer_set, const char *outname, int multiple, char **real_outname)
Definition: seq_export.cxx:757
GB_alignment_type GBT_get_alignment_type(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:740
#define TEST_EXPECT_ZERO_OR_SHOW_ERRNO(iocond)
Definition: test_unit.h:1079
bool GB_is_container(GBDATA *gbd)
Definition: arbdb.cxx:1646
long getAliLen() const
Definition: seq_export.cxx:274
#define TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(fgot, fwant, diff)
Definition: test_unit.h:1391
GB_CSTR GB_path_in_ARBLIB(const char *relative_path)
Definition: adsocket.cxx:1124
static GB_ERROR read_export_format(export_format *efo, const char *file, bool load_complete_form)
Definition: seq_export.cxx:90
void GBS_read_dir(StrArray &names, const char *dir, const char *mask)
Definition: adfile.cxx:213
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:670
GB_alignment_type
Definition: arbdb_base.h:61
an XML text node
Definition: xml.hxx:122
GB_CSTR GB_append_suffix(const char *name, const char *suffix)
Definition: adsocket.cxx:952
char * GB_unique_filename(const char *name_prefix, const char *suffix)
Definition: adsocket.cxx:1199
void appendTo(char *&content, char sep, char *&toAppend)
Definition: seqio.hxx:34
fputs(TRACE_PREFIX, stderr)
GB_CSTR GB_concat_path(GB_CSTR anypath_left, GB_CSTR anypath_right)
Definition: adsocket.cxx:1037
SmartCharPtr suffix
Definition: seqio.hxx:53
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2761
export_sequence_data(GBDATA *Gb_Main, ExportWhich which, const char *one_species, AP_filter *Filter, bool CutStopCodon, int Compress)
Definition: seq_export.cxx:232
#define sio_assert(cond)
Definition: seq_export.cxx:29
GBDATA * first_species() const
Definition: seq_export.cxx:280
const struct formatTable formname[]
GB_ERROR GB_failedTo_error(const char *do_something, const char *special, GB_ERROR error)
Definition: arb_msg.cxx:387
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
GB_ERROR is_invalid() const
Definition: AP_filter.hxx:123
char * get_exportFormat_evalForm(const char *eft_formname, GB_ERROR &error)
Definition: seq_export.cxx:824
const char * GBS_static_string(const char *str)
Definition: arb_msg.cxx:213
#define TEST_EXPECT_NO_ERROR(call)
Definition: test_unit.h:1107
unsigned char uchar
Definition: gde.hxx:21
void GB_split_full_path(const char *fullpath, char **res_dir, char **res_fullname, char **res_name_only, char **res_suffix)
Definition: adsocket.cxx:1234
bool is_std_gap(const char c)
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:97
bool GB_is_regularfile(const char *path)
Definition: arb_file.cxx:76
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
void GB_informationf(const char *templat,...)
Definition: arb_msg.cxx:502
const char * getAlignment() const
Definition: seq_export.cxx:273
static char * command
Definition: arb_a2ps.c:319
bool in_single_mode() const
Definition: seq_export.cxx:278
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:675
ARB_ERROR getError() const
Definition: ErrorOrType.h:51
void GBT_mark_all(GBDATA *gb_main, int flag)
Definition: aditem.cxx:295
GBDATA * GB_nextChild(GBDATA *child)
Definition: adquery.cxx:326
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GBDATA * gb_main
Definition: adname.cxx:33
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:441
static int info[maxsites+1]
static const char * exported_sequence(GBDATA *gb_species, size_t *seq_len, GB_ERROR *error)
Definition: seq_export.cxx:473
GBDATA * next_species(GBDATA *gb_prev) const
Definition: seq_export.cxx:281
const char * GBT_read_char_pntr(GBDATA *gb_container, const char *fieldpath)
Definition: adtools.cxx:307
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
static GB_ERROR export_write_species(GBDATA *gb_species, FILE *out, const GBL_env &env, const export_format &efo)
Definition: seq_export.cxx:557
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:274
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:649
char * description
Definition: seq_export.cxx:68
bool GB_is_privatefile(const char *path, bool read_private)
Definition: arb_file.cxx:124
const unsigned char * get_seq_data(GBDATA *gb_species, size_t &slen, GB_ERROR &error) const
Definition: seq_export.cxx:304