ARB
seq_export.cxx
Go to the documentation of this file.
1 // ============================================================= //
2 // //
3 // File : seq_export.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // ============================================================= //
10 
11 #include "seqio.hxx"
12 
13 #include <AP_filter.hxx>
14 #include <xferset.h>
15 
16 #include <arbdbt.h>
17 #include <gb_aci.h>
18 
19 #include <arb_strarray.h>
20 #include <arb_file.h>
21 #include <arb_diff.h>
22 #include <arb_progress.h>
23 #include <arb_global_defs.h>
24 
25 #include <xml.hxx>
26 
27 #include <unistd.h>
28 
29 #define sio_assert(cond) arb_assert(cond)
30 
31 using std::string;
32 using namespace SEQIO;
33 using namespace FieldTransfer;
34 
35 // ---------------------------------
36 // internal export commands
37 
// Export modes.
// The "real formats" double as indices into internal_export_commands[]
// (check_internal() casts the list index to EXPORT_CMD).
enum EXPORT_CMD {
    // real formats
    EXPORT_XML,        // selected by internal command "xml_write"

    EXPORT_INVALID,    // unknown 'INTERNAL' command
    EXPORT_USING_FORM, // default mode (has to be last entry in enum)
};
45 
46 static const char *internal_export_commands[] = {
47  "xml_write",
48  NULp
49 };
50 
51 static EXPORT_CMD check_internal(const char *command) {
53  for (int i = 0; internal_export_commands[i]; ++i) {
54  if (strcmp(command, internal_export_commands[i]) == 0) {
55  cmd = static_cast<EXPORT_CMD>(i);
56  }
57  }
58  return cmd;
59 }
60 
61 // ----------------------
62 // export_format
63 
64 struct export_format : virtual Noncopyable {
65  char *system;
66  char *pre_format;
67  char *suffix;
68  char *description; // (multiline) description of filter
69  char *form; // transformed export expression (part behind 'BEGIN')
70 
72 
74  : system(NULp),
75  pre_format(NULp),
76  suffix(NULp),
77  description(NULp),
78  form(NULp),
79  export_mode(EXPORT_XML)
80  {}
82  free(system);
83  free(pre_format);
84  free(suffix);
85  free(description);
86  free(form);
87  }
88 };
89 
90 static GB_ERROR read_export_format(export_format *efo, const char *file, bool load_complete_form) {
92 
93  if (!file || !file[0]) {
94  error = "No export format selected";
95  }
96  else {
97  char *fullfile = NULp;
98  if (GB_is_regularfile(file)) { // prefer files that are completely specified (full/rel path)
99  fullfile = strdup(GB_canonical_path(file));
100  }
101  else {
102  fullfile = nulldup(GB_path_in_ARBHOME(file)); // fallback to ARBHOME-relative specification
103  }
104 
105  FILE *in = fopen(fullfile, "r");
106 
107  if (!in) error = GB_IO_error("reading export form", fullfile);
108  else {
109  efo->export_mode = EXPORT_USING_FORM; // default mode
110  {
111  bool seen_BEGIN = false;
112  char *s1, *s2;
113  size_t linenumber = 0;
114 
115  while (!error && !seen_BEGIN && read_string_pair(in, s1, s2, linenumber)) {
116  if (!strcmp(s1, "SYSTEM")) { reassign(efo->system, s2); }
117  else if (!strcmp(s1, "PRE_FORMAT")) { reassign(efo->pre_format, s2); }
118  else if (!strcmp(s1, "SUFFIX")) { reassign(efo->suffix, s2); }
119  else if (!strcmp(s1, "DESCRIPTION")) { appendTo(efo->description, '\n', s2); }
120  else if (!strcmp(s1, "INTERNAL")) {
121  efo->export_mode = check_internal(s2);
122  if (efo->export_mode == EXPORT_INVALID) {
123  error = GBS_global_string("Unknown INTERNAL command '%s'", s2);
124  }
125  }
126  else if (!strcmp(s1, "BEGIN")) {
127  if (efo->export_mode != EXPORT_USING_FORM) {
128  error = "'BEGIN' not allowed when 'INTERNAL' is used";
129  }
130  else {
131  seen_BEGIN = true;
132  }
133  }
134  else {
135  error = GBS_global_string("Unknown command '%s'", s1);
136  }
137 
138  // add error location
139  if (error) error = GBS_global_string("%s in line #%zu", error, linenumber);
140 
141  free(s2);
142  free(s1);
143  }
144  }
145 
146  if (!error && load_complete_form && efo->export_mode == EXPORT_USING_FORM) {
147  // now 'in' points to line behind 'BEGIN'
148  char *form = GB_read_fp(in); // read rest of file
149 
150  // Join lines that end with \ with next line.
151  // Replace ' = ' and ':' by '\=' and '\:'
152  efo->form = GBS_string_eval(form, "\\\\\n=:\\==\\\\\\=:*=\\*\\=*1:\\:=\\\\\\:");
153  if (!efo->form) error = GB_failedTo_error("evaluate part below 'BEGIN'", NULp, GB_await_error());
154  free(form);
155  }
156 
157  // some checks for incompatible commands
158  if (!error) {
159  if (efo->system && !efo->pre_format) error = "Missing 'PRE_FORMAT' (needed by 'SYSTEM')";
160  else if (efo->pre_format && !efo->system) error = "Missing 'SYSTEM' (needed by 'PRE_FORMAT')";
161  else if (efo->export_mode != EXPORT_USING_FORM) {
162  if (efo->system) error = "'SYSTEM' is not allowed together with 'INTERNAL'";
163  if (efo->pre_format) error = "'PRE_FORMAT' is not allowed together with 'INTERNAL'";
164  }
165  }
166 
167  error = GB_failedTo_error("read export format", fullfile, error);
168  fclose(in);
169  }
170  free(fullfile);
171  }
172 
173  return error;
174 }
175 
176 // ----------------------------------------
177 // export sequence helper class
178 
179 class SpeciesSelector : virtual Noncopyable {
180  ExportWhich which;
181  const char *one_species;
182 
183 public:
184  SpeciesSelector(ExportWhich which_, const char *one_species_) :
185  which(which_),
186  one_species(one_species_)
187  {}
189  GBDATA *gb_species = NULp;
190  switch (which) {
191  case EBF_ALL: gb_species = GBT_first_species(gb_main); break;
192  case EBF_MARKED: gb_species = GBT_first_marked_species(gb_main); break;
193  case EBF_ONE: gb_species = GBT_find_species(gb_main, one_species); break;
194  }
195  return gb_species;
196  }
197  GBDATA *select_next(GBDATA *gb_previous) const {
198  GBDATA *gb_species = NULp;
199  switch (which) {
200  case EBF_ALL: gb_species = GBT_next_species(gb_previous); break;
201  case EBF_MARKED: gb_species = GBT_next_marked_species(gb_previous); break;
202  case EBF_ONE: break;
203  }
204  return gb_species;
205  }
206 };
207 
208 class export_sequence_data : virtual Noncopyable { // @@@ simplify using FilteredExport?
209  GBDATA *last_species_read;
210  char *seq;
211  size_t len;
212  char *error;
213 
214  GBDATA *gb_main;
215  char *ali;
216 
217  SpeciesSelector whichSpecies;
218 
219  size_t species_count;
220  AP_filter *filter;
221  bool cut_stop_codon;
222  int compress; // 0 = no;1 = vertical gaps; 2 = all gaps;
223 
224  long max_ali_len; // length of alignment
225  size_t *export_column; // list of exported seq data positions
226  size_t columns; // how many columns get exported
227 
228  GBDATA *single_species; // if set to species -> first/next only return this species (used to export to multiple files)
229 
230 public:
231 
232  export_sequence_data(GBDATA *Gb_Main, ExportWhich which, const char *one_species, AP_filter* Filter, bool CutStopCodon, int Compress) :
233  last_species_read(NULp),
234  seq(NULp),
235  len(0),
236  error(NULp),
237  gb_main(Gb_Main),
238  whichSpecies(which, one_species),
239  species_count(size_t(-1)),
240  filter(Filter),
241  cut_stop_codon(CutStopCodon),
242  compress(Compress),
243  export_column(NULp),
244  columns(0),
245  single_species(NULp)
246  {
247  sio_assert(filter);
248  sio_assert(!filter->is_invalid()); // you have to pass a valid filter
249 
250  ali = GBT_get_default_alignment(gb_main);
251  sio_assert(ali); // cannot occur (when no ali selected/exist -> filter would have been invalid above)
252 
253  max_ali_len = GBT_get_alignment_len(gb_main, ali);
254  sio_assert(max_ali_len>0);
255 
256  if (cut_stop_codon) {
257  GB_alignment_type ali_type = GBT_get_alignment_type(gb_main, ali);
258  sio_assert(ali_type != GB_AT_UNKNOWN);
259  if (ali_type != GB_AT_AA) {
260  GB_warning("Cutting stop codon makes no sense - ignored");
261  cut_stop_codon = false;
262  }
263  }
264 
265  if (max_ali_len>=0 && filter->get_length() < size_t(max_ali_len)) {
266  GB_warningf("Warning: Your filter is shorter than the alignment (%zu<%li)",
267  filter->get_length(), max_ali_len);
268  max_ali_len = filter->get_length();
269  }
270  }
271 
273  delete [] export_column;
274  delete [] seq;
275  free(error);
276  free(ali);
277  }
278 
279  const char *getAlignment() const { return ali; }
280  long getAliLen() const { return max_ali_len; }
281  GBDATA *get_gb_main() const { sio_assert(gb_main); return gb_main; }
282 
283  void set_single_mode(GBDATA *gb_species) { single_species = gb_species; }
284  bool in_single_mode() const { return single_species; }
285 
286  GBDATA *first_species() const { return single_species ? single_species : whichSpecies.select_first(gb_main); }
287  GBDATA *next_species(GBDATA *gb_prev) const { return single_species ? NULp : whichSpecies.select_next(gb_prev); }
288 
289  const unsigned char *get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& error) const;
290  static bool isGap(char c) { return GAP::is_std_gap(c); }
291 
292  size_t count_species() {
293  sio_assert(!in_single_mode());
294  if (species_count == size_t(-1)) {
295  species_count = 0;
296  for (GBDATA *gb_species = whichSpecies.select_first(gb_main);
297  gb_species;
298  gb_species = whichSpecies.select_next(gb_species))
299  {
300  species_count++;
301  }
302  }
303  return species_count;
304  }
305 
306  GB_ERROR detectVerticalGaps();
307  const char *get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& error);
308 };
309 
310 const unsigned char *export_sequence_data::get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& err) const {
311  const char *data = NULp;
312  GBDATA *gb_seq = GBT_find_sequence(gb_species, ali);
313 
314  if (!gb_seq) {
315  err = GBS_global_string_copy("No data in alignment '%s' of species '%s'", ali, GBT_get_name_or_description(gb_species));
316  slen = 0;
317  }
318  else {
319  data = GB_read_char_pntr(gb_seq);
320  slen = GB_read_count(gb_seq);
321  err = NULp;
322  }
323  return (const unsigned char *)data;
324 }
325 
326 
328  GB_ERROR err = NULp;
329 
330  sio_assert(!in_single_mode());
331 
332  if (compress == 1) { // compress vertical gaps!
333  // @@@ detection of vertical gaps should better be done either by AP_filter directly or by FilteredExport
334 
335  size_t gap_columns = filter->get_filtered_length();
336  size_t *gap_column = new size_t[gap_columns+1];
337 
338  const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos();
339  memcpy(gap_column, filterpos_2_seqpos, gap_columns*sizeof(*gap_column));
340  gap_column[gap_columns] = max_ali_len;
341 
342  arb_progress progress("Calculating vertical gaps", count_species());
343 
344  for (GBDATA *gb_species = first_species();
345  gb_species && !err;
346  gb_species = next_species(gb_species))
347  {
348  size_t slen;
349  const unsigned char *sdata = get_seq_data(gb_species, slen, err);
350 
351  if (!err) {
352  size_t j = 0;
353  size_t i;
354  for (i = 0; i<gap_columns; ++i) {
355  if (isGap(sdata[gap_column[i]])) {
356  gap_column[j++] = gap_column[i]; // keep gap column
357  }
358  // otherwise it's overwritten
359  }
360 
361  sio_assert(i >= j);
362  size_t skipped_columns = i-j;
363  sio_assert(gap_columns >= skipped_columns);
364  gap_columns -= skipped_columns;
365  }
366  progress.inc_and_check_user_abort(err);
367  }
368 
369  if (!err) {
370  columns = filter->get_filtered_length() - gap_columns;
371  export_column = new size_t[columns];
372 
373  size_t gpos = 0; // index into array of vertical gaps
374  size_t epos = 0; // index into array of exported columns
375  size_t flen = filter->get_filtered_length();
376  size_t a;
377  for (a = 0; a<flen && gpos<gap_columns; ++a) {
378  size_t fpos = filterpos_2_seqpos[a];
379  if (fpos == gap_column[gpos]) { // only gaps here -> skip column
380  gpos++;
381  }
382  else { // not only gaps -> use column
383  sio_assert(fpos<gap_column[gpos]);
384  sio_assert(epos < columns); // got more columns than expected
385  export_column[epos++] = fpos;
386  }
387  }
388  for (; a<flen; ++a) { // LOOP_VECTORIZED
389  export_column[epos++] = filterpos_2_seqpos[a];
390  }
391 
392  sio_assert(epos == columns);
393  }
394 
395  delete [] gap_column;
396  }
397  else { // compress all or none (simply use filter)
398  const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos();
399 
400  columns = filter->get_filtered_length();
401  export_column = new size_t[columns];
402 
403  memcpy(export_column, filterpos_2_seqpos, columns*sizeof(*filterpos_2_seqpos));
404  }
405 
406  seq = new char[columns+1];
407 
408  return err;
409 }
410 
411 const char *export_sequence_data::get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& err) {
412  if (gb_species != last_species_read) {
413  freenull(error);
414 
415  // read + filter a new species
416  GB_ERROR curr_error;
417  const unsigned char *data = get_seq_data(gb_species, len, curr_error);
418 
419  if (curr_error) {
420  error = strdup(curr_error);
421  }
422  else {
423  size_t i;
424  const uchar *simplify = filter->get_simplify_table();
425 
426  if (cut_stop_codon) {
427  const unsigned char *stop_codon = (const unsigned char *)memchr(data, '*', len);
428  if (stop_codon) {
429  len = stop_codon-data;
430  }
431  }
432 
433  if (compress == 2) { // compress all gaps
434  size_t j = 0;
435  for (i = 0; i<columns; ++i) {
436  size_t seq_pos = export_column[i];
437  if (seq_pos<len) {
438  unsigned char c = data[seq_pos];
439  if (!isGap(c)) {
440  seq[j++] = simplify[c];
441  }
442  }
443  }
444  seq[j] = 0;
445  len = j;
446  }
447  else { // compress vertical or compress none (simply use filter in both cases)
448  for (i = 0; i<columns; ++i) {
449  size_t seq_pos = export_column[i];
450  if (seq_pos<len) {
451  seq[i] = simplify[data[seq_pos]];
452  }
453  else {
454  seq[i] = simplify['.'];
455  }
456  }
457  seq[i] = 0;
458  len = columns;
459  }
460  }
461  }
462 
463  err = error;
464  if (error) {
465  seq_len = 0;
466  return NULp;
467  }
468 
469  seq_len = len;
470  return seq;
471 }
472 
473 // ----------------------------------------
474 // exported_sequence is hooked into ACI temporary (provides result of command 'export_sequence')
475 // which is the sequence filtered and compressed according to settings in the export window
476 
478 
479 static const char *exported_sequence(GBDATA *gb_species, size_t *seq_len, GB_ERROR *error) {
480  sio_assert(esd);
481  return esd->get_export_sequence(gb_species, *seq_len, *error);
482 }
483 
484 static GB_ERROR XML_recursive(GBDATA *gbd, int depth) {
485  GB_ERROR error = NULp;
486  const char *key_name = GB_read_key_pntr(gbd);
487  XML_Tag *tag = NULp;
488  bool descend = true;
489 
490  if (depth == 1 && strncmp(key_name, "ali_", 4) == 0) { // hack needed if seq-quality information exists
491  sio_assert(esd);
492  descend = false; // do not descend into alignments
493  if (strcmp(esd->getAlignment(), key_name) == 0) { // the wanted alignment
494 
495  tag = new XML_Tag("ALIGNMENT");
496  tag->add_attribute("name", key_name+4);
497 
498  GBDATA *gb_species = GB_get_father(gbd);
499  size_t len;
500  const char *seq = exported_sequence(gb_species, &len, &error);
501 
502  if (seq) {
503  XML_Tag dtag("data");
504  { XML_Text seqText(seq); }
505  }
506  }
507  }
508  else {
509  tag = new XML_Tag(key_name);
510 
511  if (GB_is_container(gbd)) {
512  const char *name = GBT_read_char_pntr(gbd, "name");
513  if (name) tag->add_attribute("name", name);
514  }
515  }
516 
517  if (descend) {
518  if (GB_read_type(gbd) == GB_DB) {
519  for (GBDATA *gb_child = GB_child(gbd); gb_child && !error; gb_child = GB_nextChild(gb_child)) {
520  const char *sub_key_name = GB_read_key_pntr(gb_child);
521 
522  if (strcmp(sub_key_name, "name") != 0) { // do not recurse for "name" (is handled above)
523  error = XML_recursive(gb_child, depth+1);
524  }
525  }
526  }
527  else {
528  char *content = GB_read_as_string(gbd);
529  if (content) {
530  XML_Text text(content);
531  free(content);
532  }
533  else {
534  tag->add_attribute("error", "unsavable");
535  }
536  }
537  }
538 
539  delete tag;
540  return error;
541 }
542 
543 static GB_ERROR export_species_using_form(FILE *out, const char *form, const GBL_call_env& callEnv) { // @@@ pass preparsed command (form)
544  GB_ERROR error = NULp;
545  char *pars = GBS_string_eval_in_env(" ", form, callEnv);
546  if (!pars) error = GB_await_error();
547  else {
548  char *p;
549  char *o = pars;
550  while ((p = GBS_find_string(o, "$$DELETE_LINE$$", 0))) {
551  char *l, *r;
552  for (l = p; l>o; l--) if (*l=='\n') break;
553  r = strchr(p, '\n'); if (!r) r = p + strlen(p);
554  fwrite(o, 1, l-o, out);
555  o = r;
556  }
557  fputs(o, out);
558  free(pars);
559  }
560  return error;
561 }
562 
563 static GB_ERROR export_write_species(GBDATA *gb_species, FILE *out, const GBL_env& env, const export_format& efo) {
564  GB_ERROR error = NULp;
565  switch (efo.export_mode) {
566  case EXPORT_USING_FORM: {
567  GBL_call_env callEnv(gb_species, env);
568  error = export_species_using_form(out, efo.form, callEnv);
569  break;
570  }
571 
572  case EXPORT_XML:
573  error = XML_recursive(gb_species, 0);
574  break;
575 
576  case EXPORT_INVALID:
577  sio_assert(0);
578  break;
579  }
580  return error;
581 }
582 
// Exports the species delivered by 'esd' into ONE file, using export format 'formname'.
//
// - formats specifying 'PRE_FORMAT' are handled by a recursive call which exports
//   into a temporary file, followed by running the 'SYSTEM' command on that file.
// - all other formats write the species one by one (see export_write_species()).
//
// On error the output file is removed and '*resulting_outname' is reset to NULp.
583 static GB_ERROR export_format_single(const char *db_name, const char *formname, const char *outname, char **resulting_outname, RuleSetPtr ruleset) {
584  // Exports sequences specified by 'esd' (module global variable)
585  // to format specified by 'formname'.
586  //
587  // if 'outname' == NULp -> export species to temporary file, otherwise to 'outname'.
588  // Full path of generated file is returned in 'resulting_outname'
589 
// NOTE(review): 'export_depth' is only incremented/decremented, its value is never read in this function.
590  static int export_depth = 0;
591  export_depth++;
592 
593  *resulting_outname = NULp;
594 
595  export_format efo;
596  GB_ERROR error = read_export_format(&efo, formname, true);
597 
598  if (!error) {
599  if (!outname) { // if no 'outname' is given -> export to temporary file
600  char *unique_outname = GB_unique_filename("exported", efo.suffix);
601  *resulting_outname = GB_create_tempfile(unique_outname);
602  free(unique_outname);
603 
604  if (!*resulting_outname) error = GB_await_error();
605  }
606  else *resulting_outname = strdup(outname);
607  }
608 
609  sio_assert(error || *resulting_outname);
610 
611  if (!error) {
612  if (efo.pre_format) {
613  // Export data using format 'pre_format'.
614  // Afterwards convert to wanted format using 'system'.
615 
616  sio_assert(efo.system);
617 
618  char *intermediate_export;
619  error = export_format_single(db_name, efo.pre_format, NULp, &intermediate_export, ruleset);
620  if (!error) {
621  sio_assert(GB_is_privatefile(intermediate_export, false));
622 
623  GB_informationf("Converting to %s", efo.suffix);
624 
// build the command: the rule passed to GBS_string_eval names '$<' (intermediate file) and '$>' (result file)
625  char *srt = GBS_global_string_copy("$<=%s:$>=%s", intermediate_export, *resulting_outname);
626  char *sys = GBS_string_eval(efo.system, srt);
627 
// NOTE(review): the message shows the unexpanded 'efo.system', and 'sys' is not checked for NULp before use - confirm.
628  GB_informationf("exec '%s'", efo.system);
629  error = GBK_system(sys);
630 
631  GB_unlink_or_warn(intermediate_export, &error);
632 
633  free(sys);
634  free(srt);
635  }
636  free(intermediate_export);
637  }
638  else {
639  FILE *out = fopen(*resulting_outname, "wt");
640  if (!out) error = GB_IO_error("writing", *resulting_outname);
641  else {
642  XML_Document *xml = NULp;
643 
// count species to export (needed to set up the progress counter)
644  long allCount = 0;
645  for (GBDATA *gb_species = esd->first_species();
646  gb_species && !error;
647  gb_species = esd->next_species(gb_species))
648  {
649  allCount++;
650  }
651 
652  arb_progress progress(allCount);
653  progress.auto_subtitles("Saving species");
654 
655  if (efo.export_mode == EXPORT_XML) {
656  xml = new XML_Document("ARB_SEQ_EXPORT", "arb_seq_export.dtd", out);
657  {
658  xml->add_attribute("database", db_name);
659  }
660  xml->add_attribute("export_date", ARB_date_string());
661  {
662  XML_Comment rem("There is a basic version of ARB_seq_export.dtd in $ARBHOME/lib/dtd\n"
663  "but you might need to expand it by yourself,\n"
664  "because the ARB-database may contain any kind of fields.");
665  }
666  }
667 
668  GBL_env env(esd->get_gb_main(), NULp);
669 
670  for (GBDATA *gb_species = esd->first_species();
671  gb_species && !error;
672  gb_species = esd->next_species(gb_species))
673  {
674  if (ruleset.isSet()) {
675  GB_topSecurityLevel unsecured(env.get_gb_main()); // needed to clone species (overwrites name .. in temporary clone)
// NOTE(review): the declaration of 'clone' (original source line 676) is missing in this listing;
// it has to be restored from the original source file before this function can compile.
677  if (clone.has_error()) {
678  error = clone.get_error();
679  }
680  else {
681  GB_previousSecurityLevel user(unsecured); // run export itself with normal security
682  error = export_write_species(clone.get_clone(), out, env, efo);
683  }
684  }
685  else {
686  error = export_write_species(gb_species, out, env, efo);
687  }
688  progress.inc_and_check_user_abort(error);
689  }
690 
691  delete xml;
692  fclose(out);
693  }
694  }
695  }
696 
// do not leave a (partial) result file behind after a failure
697  if (error) {
698  if (*resulting_outname) {
699  GB_unlink_or_warn(*resulting_outname, NULp);
700  freenull(*resulting_outname);
701  }
702  }
703 
704  export_depth--;
705 
706  return error;
707 }
708 
709 static GB_ERROR export_format_multiple(const char* dbname, const char *formname, const char *outname, bool multiple, char **resulting_outname, RuleSetPtr ruleset) {
710  GB_ERROR error = NULp;
711 
712  if (multiple) {
713  char *path, *name, *suffix;
714  GB_split_full_path(outname, &path, NULp, &name, &suffix);
715  *resulting_outname = NULp;
716 
717  arb_progress progress("Exporting data", esd->count_species());
718 
719  for (GBDATA *gb_species = esd->first_species();
720  gb_species && !error;
721  gb_species = esd->next_species(gb_species))
722  {
723  const char *species_name = GBT_read_char_pntr(gb_species, "name");
724  if (!species_name) error = "Can't export unnamed species";
725  else {
726  const char *fname = GB_append_suffix(GBS_global_string("%s_%s", name, species_name), suffix);
727  progress.subtitle(fname);
728 
729  char *oname = strdup(GB_concat_path(path, fname));
730  char *res_oname;
731 
732  esd->set_single_mode(gb_species); // means: only export 'gb_species'
733  error = export_format_single(dbname, formname, oname, &res_oname, ruleset);
734  esd->set_single_mode(NULp);
735 
736  if (!*resulting_outname || // not set yet
737  (res_oname && strcmp(*resulting_outname, res_oname)>0)) // or smaller than set one
738  {
739  reassign(*resulting_outname, res_oname);
740  }
741 
742  free(res_oname);
743  free(oname);
744  }
745 
746  progress.inc_and_check_user_abort(error);
747  }
748 
749  free(suffix);
750  free(name);
751  free(path);
752  }
753  else {
754  arb_progress progress("Exporting data");
755  error = export_format_single(dbname, formname, outname, resulting_outname, ruleset);
756  }
757 
758  return error;
759 }
760 
761 namespace SEQIO {
762 
763  GB_ERROR export_by_format(GBDATA *gb_main, ExportWhich which, const char *one_species,
764  AP_filter *filter, int cut_stop_codon, int compress,
765  const char *dbname, const char *formname, const char *field_transfer_set,
766  const char *outname, int multiple, char **real_outname)
767  {
769 
770  if (field_transfer_set && !field_transfer_set[0]) { // empty 'field_transfer_set' given
771  field_transfer_set = NULp; // -> handle like NULp
772  }
773 
774  GB_ERROR error = filter->is_invalid();
775 
776  RuleSetPtr ruleset;
777  if (!error) {
778  if (field_transfer_set) { // if specified load ruleset:
779  ErrorOrRuleSetPtr loaded = RuleSet::loadFrom(field_transfer_set);
780 
781  if (loaded.hasError()) {
782  ARB_ERROR lerror = loaded.getError();
783  error = lerror.deliver();
784  }
785  else {
786  ruleset = loaded.getValue();
787  }
788  }
789  }
790 
791  if (!error) {
792  esd = new export_sequence_data(gb_main, which, one_species, filter, cut_stop_codon, compress);
793  sio_assert(esd->getAliLen()>0);
794 
796 
797  error = esd->detectVerticalGaps();
798  if (!error) {
799  error = export_format_multiple(dbname, formname, outname, multiple, real_outname, ruleset);
800  if (error) error = GBS_static_string(error); // error is member of export_sequence_data -> copy to static buffer
801  }
802 
804  }
805  delete esd;
806  esd = NULp;
807 
809  return error;
810  }
811 
813  export_format efs;
814  GB_ERROR error = read_export_format(&efs, eft_formname, false);
815 
816  if (!error) {
817  if (efs.suffix) {
818  info.suffix = efs.suffix;
819  efs.suffix = NULp;
820  }
821  if (efs.description) {
822  info.description = efs.description;
823  efs.description = NULp;
824  }
825  }
826 
827  return error;
828  }
829 
830  char *get_exportFormat_evalForm(const char *eft_formname, GB_ERROR& error) {
831  // load copy of form that gets evaluated during export.
832  export_format efs;
833  error = read_export_format(&efs, eft_formname, true);
834  if (!error && efs.form) {
835  if (efs.pre_format) {
836  sio_assert(strcmp(efs.form, "*=") == 0); // caused by eval in read_export_format?
837  return get_exportFormat_evalForm(efs.pre_format, error);
838  }
839 
840  sio_assert(efs.pre_format == NULp);
841  return ARB_strdup(efs.form);
842  }
843  // failed to load form
844 
845  sio_assert(efs.form == NULp);
846  sio_assert(efs.pre_format == NULp);
847  if (!error) {
848  if (efs.export_mode != EXPORT_USING_FORM) {
849  if (efs.export_mode == EXPORT_XML) {
850  error = "exports all fields";
851  }
852  else {
853  error = "unsupported filter type";
854  }
855  }
856  else {
857  error = "no form loaded";
858  }
859  }
860 
861  sio_assert(error);
862  if (error) {
863  char *nameOnly = NULp;
864  GB_split_full_path(eft_formname, NULp, &nameOnly, NULp, NULp);
865 
866  const char *shownName = nameOnly ? nameOnly : eft_formname;
867  error = GBS_global_string("%s (%s)", error, shownName);
868 
869  free(nameOnly);
870  }
871  return NULp;
872  }
873 
874 };
875 
876 // --------------------------------------------------------------------------------
877 
878 #ifdef UNIT_TESTS
879 #include <test_unit.h>
880 
881 // uncomment to auto-update exported files
882 // (needed once after changing database or export formats)
883 // #define TEST_AUTO_UPDATE
884 #define TEST_AUTO_UPDATE_ONLY_MISSING // do auto-update only if file is missing
885 
886 void TEST_sequence_export() {
887  GB_shell shell;
888  arb_suppress_progress silence;
889 
890  GBDATA *gb_main = GB_open("TEST_loadsave.arb", "r");
891  char *export_dir = nulldup(GB_path_in_ARBLIB("export"));
892  StrArray eft;
893  GBS_read_dir(eft, export_dir, "*.eft");
894 
895  AP_filter *filter = NULp;
896  {
897  GB_transaction ta(gb_main);
898 
899  char *ali = GBT_get_default_alignment(gb_main);
900  TEST_REJECT_NULL(ali);
901 
902  size_t alilen = GBT_get_alignment_len(gb_main, ali);
903  TEST_REJECT(alilen<=0);
904 
905  filter = new AP_filter(alilen);
906 
907  GBT_mark_all(gb_main, 0);
908  GBDATA *gb_species = GBT_find_species(gb_main, "MetMazei");
909  TEST_REJECT_NULL(gb_species);
910 
911  GB_write_flag(gb_species, 1); // mark
912  free(ali);
913  }
914  for (int e = 0; eft[e]; ++e) {
915  for (int complete = 0; complete <= 1; ++complete) {
916  const char *name = strrchr(eft[e], '/');
917  TEST_REJECT_NULL(name);
918  name++;
919 
920  TEST_ANNOTATE(name);
921 
922  {
923  export_format efo;
924  TEST_EXPECT_NO_ERROR(read_export_format(&efo, eft[e], complete));
925  if (strcmp(name, "fasta_wacc.eft") == 0) { // test description of one filter
927  "Exports sequences to fasta-format.\n"
928  "Header exported as: >ID SEQLENGTH bp SEQTYPE ACC");
929  }
930  }
931 
932  if (complete) {
933  const char *outname = "impexp/exported";
934  char *used_outname = NULp;
935 
936  {
937  GB_transaction ta(gb_main);
939  filter, 0, 0,
940  "DBname", eft[e], NULp, // @@@ currently only tests export w/o FTS (pass FTS for some formats? or separately)
941  outname, 0, &used_outname));
942  }
943 
944  char *expected = GBS_global_string_copy("impexp/%s.exported", name);
945 
946 #if defined(TEST_AUTO_UPDATE)
947 #if defined(TEST_AUTO_UPDATE_ONLY_MISSING)
948  if (GB_is_regularfile(expected)) {
949  TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0);
950  }
951  else
952 #else
953  {
954  TEST_COPY_FILE(outname, expected);
955  }
956 #endif
957 #else
958  TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0);
959  // see ../../UNIT_TESTER/run/impexp
960 #endif // TEST_AUTO_UPDATE
961  TEST_EXPECT_ZERO_OR_SHOW_ERRNO(unlink(outname));
962 
963  free(expected);
964  free(used_outname);
965  }
966  }
967  }
968 
969  delete filter;
970  free(export_dir);
971  GB_close(gb_main);
972 }
973 
974 #endif // UNIT_TESTS
ExportWhich
Definition: seqio.hxx:28
GB_ERROR GBK_system(const char *system_command)
Definition: arb_msg.cxx:571
static bool isGap(char c)
Definition: seq_export.cxx:290
const char * GB_ERROR
Definition: arb_core.h:25
GBDATA * select_next(GBDATA *gb_previous) const
Definition: seq_export.cxx:197
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
char * oname
Definition: readseq.c:467
char * GB_read_fp(FILE *in)
Definition: adsocket.cxx:271
static export_sequence_data * esd
Definition: seq_export.cxx:477
void GB_warning(const char *message)
Definition: arb_msg.cxx:530
GB_ERROR detectVerticalGaps()
Definition: seq_export.cxx:327
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
GBDATA * GB_child(GBDATA *father)
Definition: adquery.cxx:322
return string(buffer, length)
char * GBS_string_eval_in_env(const char *insource, const char *icommand, const GBL_call_env &callEnv)
Definition: admatch.cxx:493
long GBT_mark_all(GBDATA *gb_main, int flag)
Definition: aditem.cxx:295
static gb_export_sequence_cb get_export_sequence
Definition: adlang1.cxx:33
void GB_unlink_or_warn(const char *path, GB_ERROR *error)
Definition: arb_file.cxx:206
NOT4PERL void GB_set_export_sequence_hook(gb_export_sequence_cb escb)
Definition: adlang1.cxx:35
bool isGap(char c)
bool read_string_pair(FILE *in, char *&s1, char *&s2, size_t &lineNr)
Definition: seqio.cxx:37
GB_ERROR GB_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:285
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
GBDATA * get_gb_main() const
Definition: seq_export.cxx:281
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1060
const char * ARB_date_string()
Definition: arb_string.cxx:35
bool hasError() const
Definition: ErrorOrType.h:50
TYPE getValue() const
Definition: ErrorOrType.h:58
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:833
bool GB_have_error()
Definition: arb_msg.cxx:338
char * GBS_string_eval(const char *insource, const char *icommand)
Definition: admatch.cxx:699
GBDATA * select_first(GBDATA *gb_main) const
Definition: seq_export.cxx:188
void set_single_mode(GBDATA *gb_species)
Definition: seq_export.cxx:283
void auto_subtitles(const char *prefix)
Definition: arb_progress.h:344
SpeciesSelector(ExportWhich which_, const char *one_species_)
Definition: seq_export.cxx:184
SmartCharPtr description
Definition: seqio.hxx:54
static EXPORT_CMD check_internal(const char *command)
Definition: seq_export.cxx:51
GBDATA * GB_get_father(GBDATA *gbd)
Definition: arbdb.cxx:1722
GB_CSTR GB_canonical_path(const char *anypath)
Definition: adsocket.cxx:1000
FILE * seq
Definition: rns.c:46
EXPORT_CMD
Definition: seq_export.cxx:38
GB_CSTR GBS_find_string(GB_CSTR cont, GB_CSTR substr, int match_mode)
Definition: admatch.cxx:103
static GB_ERROR export_format_single(const char *db_name, const char *formname, const char *outname, char **resulting_outname, RuleSetPtr ruleset)
Definition: seq_export.cxx:583
static const char * internal_export_commands[]
Definition: seq_export.cxx:46
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
EXPORT_CMD export_mode
Definition: seq_export.cxx:71
char * GB_create_tempfile(const char *name)
Definition: adsocket.cxx:1222
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:758
Definition: arbdb.h:78
GB_TYPES GB_read_type(GBDATA *gbd)
Definition: arbdb.cxx:1643
static GB_ERROR export_species_using_form(FILE *out, const char *form, const GBL_call_env &callEnv)
Definition: seq_export.cxx:543
GB_ERROR deliver() const
Definition: arb_error.h:116
void GB_warningf(const char *templat,...)
Definition: arb_msg.cxx:536
GB_CSTR GB_read_key_pntr(GBDATA *gbd)
Definition: arbdb.cxx:1656
bool isSet() const
test if SmartPtr is not NULp
Definition: smartptr.h:245
char * pre_format
Definition: seq_export.cxx:66
Generic smart pointer.
Definition: smartptr.h:149
static GB_ERROR export_format_multiple(const char *dbname, const char *formname, const char *outname, bool multiple, char **resulting_outname, RuleSetPtr ruleset)
Definition: seq_export.cxx:709
static GB_ERROR XML_recursive(GBDATA *gbd, int depth)
Definition: seq_export.cxx:484
#define TEST_REJECT(cond)
Definition: test_unit.h:1330
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1325
static void error(const char *msg)
Definition: mkptypes.cxx:96
GB_CSTR GB_path_in_ARBHOME(const char *relative_path)
Definition: adsocket.cxx:1149
size_t get_length() const
Definition: AP_filter.hxx:83
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
GB_ERROR get_exportFormat_information(const char *eft_formname, ExportFormatInfo &info)
Definition: seq_export.cxx:812
const char * get_export_sequence(GBDATA *gb_species, size_t &seq_len, GB_ERROR &error)
Definition: seq_export.cxx:411
GB_ERROR export_by_format(GBDATA *gb_main, ExportWhich which, const char *one_species, AP_filter *filter, int cut_stop_codon, int compress, const char *dbname, const char *formname, const char *field_transfer_set, const char *outname, int multiple, char **real_outname)
Definition: seq_export.cxx:763
GB_alignment_type GBT_get_alignment_type(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:878
#define TEST_EXPECT_ZERO_OR_SHOW_ERRNO(iocond)
Definition: test_unit.h:1090
bool GB_is_container(GBDATA *gbd)
Definition: arbdb.cxx:1648
long getAliLen() const
Definition: seq_export.cxx:280
#define TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(fgot, fwant, diff)
Definition: test_unit.h:1419
GB_CSTR GB_path_in_ARBLIB(const char *relative_path)
Definition: adsocket.cxx:1156
static GB_ERROR read_export_format(export_format *efo, const char *file, bool load_complete_form)
Definition: seq_export.cxx:90
void GBS_read_dir(StrArray &names, const char *dir, const char *mask)
Definition: adfile.cxx:213
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:708
GB_alignment_type
Definition: arbdb_base.h:61
a xml text node
Definition: xml.hxx:122
GB_CSTR GB_append_suffix(const char *name, const char *suffix)
Definition: adsocket.cxx:984
char * GB_unique_filename(const char *name_prefix, const char *suffix)
Definition: adsocket.cxx:1233
void appendTo(char *&content, char sep, char *&toAppend)
Definition: seqio.hxx:34
fputs(TRACE_PREFIX, stderr)
GB_CSTR GB_concat_path(GB_CSTR anypath_left, GB_CSTR anypath_right)
Definition: adsocket.cxx:1069
SmartCharPtr suffix
Definition: seqio.hxx:53
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2773
export_sequence_data(GBDATA *Gb_Main, ExportWhich which, const char *one_species, AP_filter *Filter, bool CutStopCodon, int Compress)
Definition: seq_export.cxx:232
#define sio_assert(cond)
Definition: seq_export.cxx:29
GBDATA * first_species() const
Definition: seq_export.cxx:286
const struct formatTable formname[]
GB_ERROR GB_failedTo_error(const char *do_something, const char *special, GB_ERROR error)
Definition: arb_msg.cxx:375
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
GB_ERROR is_invalid() const
Definition: AP_filter.hxx:123
char * get_exportFormat_evalForm(const char *eft_formname, GB_ERROR &error)
Definition: seq_export.cxx:830
const char * GBS_static_string(const char *str)
Definition: arb_msg.cxx:212
#define TEST_EXPECT_NO_ERROR(call)
Definition: test_unit.h:1118
unsigned char uchar
Definition: gde.hxx:21
void GB_split_full_path(const char *fullpath, char **res_dir, char **res_fullname, char **res_name_only, char **res_suffix)
Definition: adsocket.cxx:1268
bool is_std_gap(const char c)
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:116
bool GB_is_regularfile(const char *path)
Definition: arb_file.cxx:76
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
void GB_informationf(const char *templat,...)
Definition: arb_msg.cxx:555
const char * getAlignment() const
Definition: seq_export.cxx:279
static char * command
Definition: arb_a2ps.c:319
bool in_single_mode() const
Definition: seq_export.cxx:284
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:747
ARB_ERROR getError() const
Definition: ErrorOrType.h:53
GBDATA * GB_nextChild(GBDATA *child)
Definition: adquery.cxx:326
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:32
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:459
static int info[maxsites+1]
static const char * exported_sequence(GBDATA *gb_species, size_t *seq_len, GB_ERROR *error)
Definition: seq_export.cxx:479
GBDATA * next_species(GBDATA *gb_prev) const
Definition: seq_export.cxx:287
const char * GBT_read_char_pntr(GBDATA *gb_container, const char *fieldpath)
Definition: adtools.cxx:307
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1294
static GB_ERROR export_write_species(GBDATA *gb_species, FILE *out, const GBL_env &env, const export_format &efo)
Definition: seq_export.cxx:563
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:332
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:655
char * description
Definition: seq_export.cxx:68
bool GB_is_privatefile(const char *path, bool read_private)
Definition: arb_file.cxx:124
const unsigned char * get_seq_data(GBDATA *gb_species, size_t &slen, GB_ERROR &error) const
Definition: seq_export.cxx:310