ARB
arb_export_sequences.cxx
Go to the documentation of this file.
1 #include <stdio.h> // needed for arbdb.h
2 #include <string.h>
3 #include <unistd.h>
4 
5 #include <arbdb.h>
6 #include <arbdbt.h>
7 #include <arb_file.h>
8 #include <AP_filter.hxx>
9 #include <seqio.hxx>
10 #include <insdel.h>
11 
12 #include <string>
13 #include <vector>
14 #include <fstream>
15 #include <map>
16 #include <iostream>
17 #include <sstream>
18 #include <set>
19 #include <iterator>
20 #include <algorithm>
21 
22 using std::cerr;
23 using std::endl;
24 using std::clog;
25 using std::cout;
26 using std::string;
27 using std::map;
28 using std::ifstream;
29 using std::ofstream;
30 using std::vector;
31 using std::stringstream;
32 using std::getline;
33 using std::set;
34 
35 template<typename T>
36 struct equals {
37  const T& t;
38  equals(const T& _t) : t(_t) {}
39  bool operator()(const T& _t) { return _t == t; }
40 };
41 
42 /* fake aw_message/aw_status functions */
43 
44 static void aw_status(double d) {
45  cerr << "aw_status_fraction: " << d << endl;
46  cerr.flush();
47 }
48 
49 /* generic writer class */
50 class Writer : virtual Noncopyable {
51  string err_state;
52 protected:
53  virtual void set_error(const char *error) {
54  arb_assert(error);
55  if (!err_state.empty()) {
56  cerr << "ERROR: " << err_state << endl;
57  }
58  err_state = error;
59  }
60 
61 public:
62  virtual ~Writer() {}
63 
64  virtual void addSequence(GBDATA*) = 0;
65  virtual void finish() {}
66 
67  const char *get_error() const {
68  return err_state.empty() ? NULp : err_state.c_str();
69  }
70  bool ok() const { return !get_error(); }
71 };
72 
73 class MultiFastaWriter : public Writer { // derived from Noncopyable
74  ofstream file;
75  string default_ali;
76  double count, count_max;
77 public:
78  MultiFastaWriter(string s, const char* a, int c)
79  : file(s.c_str()),
80  default_ali(a),
81  count(0), count_max(c)
82  {}
83 
84  string readString(GBDATA* spec, const char* key) {
85  GBDATA *gbd = GB_find(spec, key, SEARCH_CHILD);
86  if (!gbd) return string("<empty>");
87  char *str= GB_read_as_string(gbd);
88  if (!str) return string("<empty>");
89  string rval = str;
90  free(str);
91  return rval;
92  }
93 
94  string readData(GBDATA* spec) {
95  GBDATA *gbd = GB_find(spec, default_ali.c_str(), SEARCH_CHILD);
96  if (!gbd) return string("<empty>");
97  string result = readString(gbd, "data");
98  GB_release(gbd);
99  return result;
100  }
101 
103  file << ">"
104  << readString(spec, "acc") << "."
105  << readString(spec, "start") << "."
106  << readString(spec, "stop") << " | "
107  << readString(spec, "full_name") << "\n";
108 
109  string tmp = readData(spec);
110  std::replace_if(tmp.begin(),tmp.end(),equals<char>('.'),'-');
111 
112  unsigned int i;
113  for (i=0; i+70<tmp.size(); i+=70)
114  file << tmp.substr(i,70) << "\n";
115  file << tmp.substr(i) << "\n";
116  aw_status(++count/count_max);
117  }
118 };
119 
120 class ArbWriter FINAL_TYPE : public Writer { // derived from Noncopyable
121  string template_arb;
122  string filename;
123  double count, count_max;
124  GBDATA *gbdst, *gbdst_spec;
125 
126  void close() {
127  if (gbdst) {
128  GB_close(gbdst);
129  gbdst = NULp;
130  }
131  }
132 
133  void set_error(const char *error) OVERRIDE {
135  close();
136  }
137 
138 public:
139  ArbWriter(string ta, string f, int c)
140  : template_arb(ta),
141  filename(f),
142  count(0),
143  count_max(c)
144  {
145  gbdst = GB_open(template_arb.c_str(), "rw");
146  if (!gbdst) set_error("Unable to open template file");
147  else {
148  if (GB_request_undo_type(gbdst, GB_UNDO_NONE)) {
149  cerr << "WARN: Unable to disable undo buffer";
150  }
151 
152  GB_begin_transaction(gbdst);
153  gbdst_spec = GB_search(gbdst, "species_data", GB_CREATE_CONTAINER);
154  GB_commit_transaction(gbdst);
155 
156  GB_ERROR error = GB_save_as(gbdst, filename.c_str(), "b");
157  if (error) set_error(GBS_global_string("Unable to open output file (Reason: %s)", error));
158  }
159  }
161  close();
162  if (!ok()) {
163  unlink(filename.c_str());
164  }
165  }
166 
167  void addSequence(GBDATA* gbspec) OVERRIDE {
168  if (gbdst) {
169  GB_transaction ta(gbdst);
170  GBDATA *gbnew = GB_create_container(gbdst_spec, "species");
171  GB_ERROR error = NULp;
172  if (!gbnew) {
173  error = GB_await_error();
174  }
175  else {
176  error = GB_copy_dropProtectMarksAndTempstate(gbnew, gbspec);
177  if (!error) error = GB_release(gbnew);
178  }
179  if (error) set_error(error);
180  }
181  aw_status(++count/count_max);
182  }
183 
184  void finish() OVERRIDE {
185  GB_ERROR error = NULp;
186  if (gbdst) {
187  cerr << "Compressing..." << endl;
188  GB_transaction trans(gbdst);
189  char *ali = GBT_get_default_alignment(gbdst);
190  if (!ali) {
191  GB_clear_error();
192  error = "Your template DB has no default alignment";
193  }
194  else {
195  error = ARB_format_alignment(gbdst, ali);
196  if (error) error = GBS_global_string("Failed to format alignments (Reason: %s)", error);
197  free(ali);
198  }
199  }
200 
201  if (!error && GB_save_as(gbdst, filename.c_str(), "b")) {
202  error = "Unable to save to output file";
203  }
204 
205  if (error) set_error(error);
206  }
207 };
208 
209 class AwtiExportWriter FINAL_TYPE : public Writer { // derived from Noncopyable
210  GBDATA *gbmain;
211  const char *dbname;
212  const char *formname; // export form (.eft)
213  const char *outname;
214  const int compress;
215 
216 public:
217  AwtiExportWriter(GBDATA *_gbmain, const char *_dbname, const char* eft,
218  const char* out, int c)
219  : gbmain(_gbmain), dbname(_dbname), formname(eft), outname(out), compress(c)
220  {
221  GBT_mark_all(gbmain, 0);
222  }
223  void addSequence(GBDATA *gbspec) OVERRIDE {
224  GB_write_flag(gbspec, 1);
225  }
226 
227  void finish() OVERRIDE {
228  const int cut_stop_codon = 0;
229  const int multiple = 0;
230 
231  GB_ERROR err;
232  char *aliname = GBT_get_default_alignment(gbmain);
233  if (!aliname) {
234  err = GB_await_error();
235  }
236  else {
237  int aliLen = GBT_get_alignment_len(gbmain, aliname);
238  if (aliLen<=0) {
239  err = GB_await_error();
240  }
241  else {
242  AP_filter filter(aliLen);
243  char *real_outname = NULp;
244 
246  &filter, cut_stop_codon, compress,
247  dbname, formname, NULp, // @@@ pass name of specified FTS here
248  outname, multiple, &real_outname);
249  free(real_outname);
250  }
251  free(aliname);
252  }
253  if (err) set_error(err);
254  }
255 };
256 
257 
258 // @@@ add new parameter allowing to pass FTS (e.g. --fts <FILE>)!
259 
260 static void help() {
261  cerr <<
262  "Exports sequences from an ARB database."
263  "\n"
264  "Parameters:\n"
265  " --source <FILE> source file\n"
266  " --dest <FILE> destination file\n"
267  " --format <ARG> 'ARB', 'FASTA' or path to ARB EFT file\n"
268  " --accs <FILE> export only sequences matching these accessions\n"
269  "\n"
270  "only for EFT output format:\n"
271  " --eft-compress-gaps <ARG> 'none', 'vertical' or 'all' (default none)\n"
272  "\n"
273  "required for ARB output format:\n"
274  " --arb-template <FILE> template (empty) arb file\n"
275 
276  << endl;;
277 }
278 
279 static set<string> read_accession_file(const char* file) {
280  set<string> accessions;
281  ifstream ifs(file);
282  string linestring;
283 
284  do {
285  getline(ifs,linestring);
286  if (!linestring.empty()) {
287  stringstream line(linestring);
288  string item;
289  do {
290  getline(line,item,',');
291  item.erase(item.find_last_not_of(" \n,\t")+1);
292  item.erase(0,item.find_first_not_of(" \n,\t"));
293  if (!item.empty())
294  accessions.insert(item);
295  } while(line.good());
296  }
297  } while(ifs.good());
298 
299  return accessions;
300 }
301 
302 int main(int argc, char** argv) {
303  enum {
304  FMT_ARB,
305  FMT_FASTA,
306  FMT_EFT,
307  FMT_NONE
308  } format = FMT_NONE;
309 
310  char *src = NULp;
311  char *dest = NULp;
312  char *tmpl = NULp;
313  char *accs = NULp;
314  char *eft = NULp;
315  int compress = 0;
316  GB_shell shell;
317 
318  if (argc <2) {
319  help();
320  return 0;
321  }
322 
323  for (int i = 1; i < argc; i++) {
324  if (argc > i+1) {
325  if (!strcmp(argv[i], "--source")) {
326  src = argv[++i];
327  } else if (!strcmp(argv[i], "--dest")) {
328  dest = argv[++i];
329  } else if (!strcmp(argv[i], "--arb-template")) {
330  tmpl = argv[++i];
331  } else if (!strcmp(argv[i], "--accs")) {
332  accs = argv[++i];
333  } else if (!strcmp(argv[i], "--format")) {
334  if (!strcmp(argv[++i], "ARB")) {
335  format = FMT_ARB;
336  } else if (!strcmp(argv[i], "FASTA")) {
337  format = FMT_FASTA;
338  } else {
339  format = FMT_EFT;
340  eft = argv[i];
341  }
342  } else if (!strcmp(argv[i], "--eft-compress-gaps")) {
343  if (!strcmp(argv[++i], "none")) {
344  compress = 0;
345  } else if (!strcmp(argv[i], "vertical")) {
346  compress = 1;
347  } else if (!strcmp(argv[i], "all")) {
348  compress = 2;
349  } else {
350  cerr << "missing argument to --eft-compress-gaps?";
351  i--;
352  }
353  } else {
354  cerr << "ignoring malformed argument: '" << argv[i] << "'" << endl;
355  }
356  } else {
357  cerr << "ignoring malformed argument: '" << argv[i] << "'" << endl;
358  }
359  }
360 
361  if (!src) {
362  cerr << "need '--source' parameter" << endl;
363  return 1;
364  }
365  if (!dest) {
366  cerr << "need '--dest' parameter" << endl;
367  return 1;
368  }
369  if (format == FMT_NONE) {
370  cerr << "need '--format' parameter" << endl;
371  return 1;
372  }
373  if (format == FMT_ARB && !tmpl) {
374  cerr << "need '--arb-template' parameter for output type ARB'" << endl;
375  return 1;
376  }
377 
378  set<string> accessions;
379  if (accs) {
380  if (!GB_is_regularfile(accs)) {
381  cerr << "unable to open accession number file '" << accs << '\'' << endl;
382  return 1;
383  }
384  accessions = read_accession_file(accs);
385  }
386 
387  GB_ERROR error = NULp;
388  int exitcode = 0;
389 
390  GBDATA *gbsrc = GB_open(src, "r");
391  if (!gbsrc) {
392  error = "unable to open source file";
393  exitcode = 1;
394  }
395 
396  if (!error) {
397  GB_transaction trans(gbsrc);
398 
399  int count_max = 0;
400  if (accs) {
401  count_max = accessions.size();
402  } else {
403  for (GBDATA *gbspec = GBT_first_species(gbsrc);
404  gbspec; gbspec = GBT_next_species(gbspec)) {
405  count_max++;
406  }
407  }
408 
409  // create writer for target type
410  Writer *writer = NULp;
411  switch(format) {
412  case FMT_ARB:
413  writer = new ArbWriter(tmpl, dest, count_max);
414  break;
415  case FMT_FASTA: {
416  char *default_ali = GBT_get_default_alignment(gbsrc);
417  if (!default_ali) {
418  error = GB_await_error();
419  }
420  else {
421  writer = new MultiFastaWriter(dest, default_ali, count_max);
422  free(default_ali);
423  }
424  break;
425  }
426  case FMT_EFT:
427  writer = new AwtiExportWriter(gbsrc, src, eft, dest, compress);
428  break;
429 
430  case FMT_NONE:
431  error = "internal error";
432  exitcode = 2;
433  break;
434  }
435 
436  if (writer) {
437  // generate file
438  for (GBDATA *gbspec = GBT_first_species(gbsrc);
439  gbspec && writer->ok();
440  gbspec = GBT_next_species(gbspec))
441  {
442  GBDATA *gbname = GB_find(gbspec, "acc", SEARCH_CHILD);
443  string name(GB_read_char_pntr(gbname));
444 
445  if (!accs || accessions.count(name)) {
446  writer->addSequence(gbspec);
447  }
448  GB_release(gbspec);
449  }
450 
451  if (writer->ok()) writer->finish();
452 
453  if (!writer->ok()) {
454  error = GBS_static_string(writer->get_error());
455  exitcode = 5;
456  }
457  delete writer;
458  }
459  }
460  if (gbsrc) GB_close(gbsrc);
461 
462  if (error) cerr << "ERROR: " << error << endl;
463  return exitcode;
464 }
465 
GB_ERROR GB_begin_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2528
GB_ERROR GB_copy_dropProtectMarksAndTempstate(GBDATA *dest, GBDATA *source)
Definition: arbdb.cxx:2152
#define arb_assert(cond)
Definition: arb_assert.h:245
string result
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GB_ERROR GB_commit_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2551
virtual bool ok() const =0
string readData(GBDATA *spec)
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
return string(buffer, length)
void addSequence(GBDATA *gbspec) OVERRIDE
GBDATA * GB_find(GBDATA *gbd, const char *key, GB_SEARCH_TYPE gbs)
Definition: adquery.cxx:295
long GBT_mark_all(GBDATA *gb_main, int flag)
Definition: aditem.cxx:295
virtual void addSequence(GBDATA *)=0
static void help()
AwtiExportWriter(GBDATA *_gbmain, const char *_dbname, const char *eft, const char *out, int c)
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1060
static set< string > read_accession_file(const char *file)
ArbWriter(string ta, string f, int c)
bool ok() const
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:833
void addSequence(GBDATA *spec) OVERRIDE
Definition: reader.h:95
GB_ERROR GB_release(GBDATA *gbd)
Definition: arbdb.cxx:2600
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
GBDATA * GB_create_container(GBDATA *father, const char *key)
Definition: arbdb.cxx:1829
GB_ERROR ARB_format_alignment(GBDATA *Main, const char *alignment_name)
Definition: insdel.cxx:1439
void GB_clear_error()
Definition: arb_msg.cxx:354
GB_ERROR GB_save_as(GBDATA *gbd, const char *path, const char *savetype)
static void error(const char *msg)
Definition: mkptypes.cxx:96
char * str
Definition: defines.h:20
equals(const T &_t)
GB_ERROR export_by_format(GBDATA *gb_main, ExportWhich which, const char *one_species, AP_filter *filter, int cut_stop_codon, int compress, const char *dbname, const char *formname, const char *field_transfer_set, const char *outname, int multiple, char **real_outname)
Definition: seq_export.cxx:763
virtual ~Writer()
int main(int argc, char **argv)
static void aw_status(double d)
virtual void finish()
string readString(GBDATA *spec, const char *key)
void finish() OVERRIDE
xml element
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2773
#define OVERRIDE
Definition: cxxforward.h:112
const struct formatTable formname[]
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
MultiFastaWriter(string s, const char *a, int c)
const char * GBS_static_string(const char *str)
Definition: arb_msg.cxx:212
static int line
Definition: arb_a2ps.c:296
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
virtual void set_error(const char *error)
#define NULp
Definition: cxxforward.h:116
bool GB_is_regularfile(const char *path)
Definition: arb_file.cxx:76
const char * get_error() const
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:747
Definition: trnsprob.h:20
GB_ERROR GB_request_undo_type(GBDATA *gb_main, GB_UNDO_TYPE type) __ATTR__USERESULT_TODO
Definition: adindex.cxx:721
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:655
bool operator()(const T &_t)
GB_write_int const char s
Definition: AW_awar.cxx:154