ARB
arb_export_sequences.cxx
Go to the documentation of this file.
1 #include <stdio.h> // needed for arbdb.h
2 #include <string.h>
3 #include <unistd.h>
4 
5 #include <arbdb.h>
6 #include <arbdbt.h>
7 #include <arb_file.h>
8 #include <AP_filter.hxx>
9 #include <seqio.hxx>
10 #include <insdel.h>
11 
12 #include <string>
13 #include <vector>
14 #include <fstream>
15 #include <map>
16 #include <iostream>
17 #include <sstream>
18 #include <set>
19 #include <iterator>
20 #include <algorithm>
21 
22 using std::cerr;
23 using std::endl;
24 using std::clog;
25 using std::cout;
26 using std::string;
27 using std::map;
28 using std::ifstream;
29 using std::ofstream;
30 using std::vector;
31 using std::stringstream;
32 using std::getline;
33 using std::set;
34 
35 template<typename T>
36 struct equals {
37  const T& t;
38  equals(const T& _t) : t(_t) {}
39  bool operator()(const T& _t) { return _t == t; }
40 };
41 
42 /* fake aw_message/aw_status functions */
43 
44 static void aw_status(double d) {
45  cerr << "aw_status_fraction: " << d << endl;
46  cerr.flush();
47 }
48 
49 /* generic writer class */
50 class Writer : virtual Noncopyable {
51  string err_state;
52 protected:
53  virtual void set_error(const char *error) {
54  arb_assert(error);
55  if (!err_state.empty()) {
56  cerr << "ERROR: " << err_state << endl;
57  }
58  err_state = error;
59  }
60 
61 public:
62  virtual ~Writer() {}
63 
64  virtual void addSequence(GBDATA*) = 0;
65  virtual void finish() {}
66 
67  const char *get_error() const {
68  return err_state.empty() ? NULp : err_state.c_str();
69  }
70  bool ok() const { return !get_error(); }
71 };
72 
73 class MultiFastaWriter : public Writer { // derived from Noncopyable
74  ofstream file;
75  string default_ali;
76  double count, count_max;
77 public:
78  MultiFastaWriter(string s, const char* a, int c)
79  : file(s.c_str()),
80  default_ali(a),
81  count(0), count_max(c)
82  {}
83 
84  string readString(GBDATA* spec, const char* key) {
85  GBDATA *gbd = GB_find(spec, key, SEARCH_CHILD);
86  if (!gbd) return string("<empty>");
87  char *str= GB_read_as_string(gbd);
88  if (!str) return string("<empty>");
89  string rval = str;
90  free(str);
91  return rval;
92  }
93 
94  string readData(GBDATA* spec) {
95  GBDATA *gbd = GB_find(spec, default_ali.c_str(), SEARCH_CHILD);
96  if (!gbd) return string("<empty>");
97  string result = readString(gbd, "data");
98  GB_release(gbd);
99  return result;
100  }
101 
103  file << ">"
104  << readString(spec, "acc") << "."
105  << readString(spec, "start") << "."
106  << readString(spec, "stop") << " | "
107  << readString(spec, "full_name") << "\n";
108 
109  string tmp = readData(spec);
110  std::replace_if(tmp.begin(),tmp.end(),equals<char>('.'),'-');
111 
112  unsigned int i;
113  for (i=0; i+70<tmp.size(); i+=70)
114  file << tmp.substr(i,70) << "\n";
115  file << tmp.substr(i) << "\n";
116  aw_status(++count/count_max);
117  }
118 };
119 
120 class ArbWriter FINAL_TYPE : public Writer { // derived from Noncopyable
121  string template_arb;
122  string filename;
123  double count, count_max;
124  GBDATA *gbdst, *gbdst_spec;
125 
126  void close() {
127  if (gbdst) {
128  GB_close(gbdst);
129  gbdst = NULp;
130  }
131  }
132 
133  void set_error(const char *error) OVERRIDE {
135  close();
136  }
137 
138 public:
139  ArbWriter(string ta, string f, int c)
140  : template_arb(ta),
141  filename(f),
142  count(0),
143  count_max(c)
144  {
145  gbdst = GB_open(template_arb.c_str(), "rw");
146  if (!gbdst) set_error("Unable to open template file");
147  else {
148  if (GB_request_undo_type(gbdst, GB_UNDO_NONE)) {
149  cerr << "WARN: Unable to disable undo buffer";
150  }
151 
152  GB_begin_transaction(gbdst);
153  gbdst_spec = GB_search(gbdst, "species_data", GB_CREATE_CONTAINER);
154  GB_commit_transaction(gbdst);
155 
156  GB_ERROR error = GB_save_as(gbdst, filename.c_str(), "b");
157  if (error) set_error(GBS_global_string("Unable to open output file (Reason: %s)", error));
158  }
159  }
161  close();
162  if (!ok()) {
163  unlink(filename.c_str());
164  }
165  }
166 
167  void addSequence(GBDATA* gbspec) OVERRIDE {
168  if (gbdst) {
169  GB_transaction ta(gbdst);
170  GBDATA *gbnew = GB_create_container(gbdst_spec, "species");
171  GB_ERROR error = NULp;
172  if (!gbnew) {
173  error = GB_await_error();
174  }
175  else {
176  error = GB_copy_dropProtectMarksAndTempstate(gbnew, gbspec);
177  if (!error) error = GB_release(gbnew);
178  }
179  if (error) set_error(error);
180  }
181  aw_status(++count/count_max);
182  }
183 
184  void finish() OVERRIDE {
185  GB_ERROR error = NULp;
186  if (gbdst) {
187  cerr << "Compressing..." << endl;
188  GB_transaction trans(gbdst);
189  char *ali = GBT_get_default_alignment(gbdst);
190  if (!ali) {
191  error = "Your template DB has no default alignment";
192  }
193  else {
194  error = ARB_format_alignment(gbdst, ali);
195  if (error) error = GBS_global_string("Failed to format alignments (Reason: %s)", error);
196  free(ali);
197  }
198  }
199 
200  if (!error && GB_save_as(gbdst, filename.c_str(), "b")) {
201  error = "Unable to save to output file";
202  }
203 
204  if (error) set_error(error);
205  }
206 };
207 
208 class AwtiExportWriter FINAL_TYPE : public Writer { // derived from Noncopyable
209  GBDATA *gbmain;
210  const char *dbname;
211  const char *formname; // export form (.eft)
212  const char *outname;
213  const int compress;
214 
215 public:
216  AwtiExportWriter(GBDATA *_gbmain, const char *_dbname, const char* eft,
217  const char* out, int c)
218  : gbmain(_gbmain), dbname(_dbname), formname(eft), outname(out), compress(c)
219  {
220  GBT_mark_all(gbmain, 0);
221  }
222  void addSequence(GBDATA *gbspec) OVERRIDE {
223  GB_write_flag(gbspec, 1);
224  }
225 
226  void finish() OVERRIDE {
227  const int cut_stop_codon = 0;
228  const int multiple = 0;
229 
230  char *aliname = GBT_get_default_alignment(gbmain);
231  AP_filter filter(GBT_get_alignment_len(gbmain, aliname));
232  char *real_outname = NULp;
233 
235  &filter, cut_stop_codon, compress,
236  dbname, formname, NULp, // @@@ pass name of specified FTS here
237  outname, multiple, &real_outname);
238  if (err) set_error(err);
239 
240  free(real_outname);
241  free(aliname);
242  }
243 };
244 
245 
246 // @@@ add new parameter allowing to pass FTS (e.g. --fts <FILE>)!
247 
248 static void help() {
249  cerr <<
250  "Exports sequences from an ARB database."
251  "\n"
252  "Parameters:\n"
253  " --source <FILE> source file\n"
254  " --dest <FILE> destination file\n"
255  " --format <ARG> 'ARB', 'FASTA' or path to ARB EFT file\n"
256  " --accs <FILE> export only sequences matching these accessions\n"
257  "\n"
258  "only for EFT output format:\n"
259  " --eft-compress-gaps <ARG> 'none', 'vertical' or 'all' (default none)\n"
260  "\n"
261  "required for ARB output format:\n"
262  " --arb-template <FILE> template (empty) arb file\n"
263 
264  << endl;;
265 }
266 
267 static set<string> read_accession_file(const char* file) {
268  set<string> accessions;
269  ifstream ifs(file);
270  string linestring;
271 
272  do {
273  getline(ifs,linestring);
274  if (!linestring.empty()) {
275  stringstream line(linestring);
276  string item;
277  do {
278  getline(line,item,',');
279  item.erase(item.find_last_not_of(" \n,\t")+1);
280  item.erase(0,item.find_first_not_of(" \n,\t"));
281  if (!item.empty())
282  accessions.insert(item);
283  } while(line.good());
284  }
285  } while(ifs.good());
286 
287  return accessions;
288 }
289 
290 int main(int argc, char** argv) {
291  enum {
292  FMT_ARB,
293  FMT_FASTA,
294  FMT_EFT,
295  FMT_NONE
296  } format = FMT_NONE;
297 
298  char *src = NULp;
299  char *dest = NULp;
300  char *tmpl = NULp;
301  char *accs = NULp;
302  char *eft = NULp;
303  int compress = 0;
304  GB_shell shell;
305 
306  if (argc <2) {
307  help();
308  return 0;
309  }
310 
311  for (int i = 1; i < argc; i++) {
312  if (argc > i+1) {
313  if (!strcmp(argv[i], "--source")) {
314  src = argv[++i];
315  } else if (!strcmp(argv[i], "--dest")) {
316  dest = argv[++i];
317  } else if (!strcmp(argv[i], "--arb-template")) {
318  tmpl = argv[++i];
319  } else if (!strcmp(argv[i], "--accs")) {
320  accs = argv[++i];
321  } else if (!strcmp(argv[i], "--format")) {
322  if (!strcmp(argv[++i], "ARB")) {
323  format = FMT_ARB;
324  } else if (!strcmp(argv[i], "FASTA")) {
325  format = FMT_FASTA;
326  } else {
327  format = FMT_EFT;
328  eft = argv[i];
329  }
330  } else if (!strcmp(argv[i], "--eft-compress-gaps")) {
331  if (!strcmp(argv[++i], "none")) {
332  compress = 0;
333  } else if (!strcmp(argv[i], "vertical")) {
334  compress = 1;
335  } else if (!strcmp(argv[i], "all")) {
336  compress = 2;
337  } else {
338  cerr << "missing argument to --eft-compress-gaps?";
339  i--;
340  }
341  } else {
342  cerr << "ignoring malformed argument: '" << argv[i] << "'" << endl;
343  }
344  } else {
345  cerr << "ignoring malformed argument: '" << argv[i] << "'" << endl;
346  }
347  }
348 
349  if (!src) {
350  cerr << "need '--source' parameter" << endl;
351  return 1;
352  }
353  if (!dest) {
354  cerr << "need '--dest' parameter" << endl;
355  return 1;
356  }
357  if (format == FMT_NONE) {
358  cerr << "need '--format' parameter" << endl;
359  return 1;
360  }
361  if (format == FMT_ARB && !tmpl) {
362  cerr << "need '--arb-template' parameter for output type ARB'" << endl;
363  return 1;
364  }
365 
366  set<string> accessions;
367  if (accs) {
368  if (!GB_is_regularfile(accs)) {
369  cerr << "unable to open accession number file '" << accs << '\'' << endl;
370  return 1;
371  }
372  accessions = read_accession_file(accs);
373  }
374 
375  GB_ERROR error = NULp;
376  int exitcode = 0;
377 
378  GBDATA *gbsrc = GB_open(src, "r");
379  if (!gbsrc) {
380  error = "unable to open source file";
381  exitcode = 1;
382  }
383 
384  if (!error) {
385  GB_transaction trans(gbsrc);
386 
387  int count_max = 0;
388  if (accs) {
389  count_max = accessions.size();
390  } else {
391  for (GBDATA *gbspec = GBT_first_species(gbsrc);
392  gbspec; gbspec = GBT_next_species(gbspec)) {
393  count_max++;
394  }
395  }
396 
397  // create writer for target type
398  Writer *writer = NULp;
399  switch(format) {
400  case FMT_ARB:
401  writer = new ArbWriter(tmpl, dest, count_max);
402  break;
403  case FMT_FASTA: {
404  char *default_ali = GBT_get_default_alignment(gbsrc);
405  writer = new MultiFastaWriter(dest, default_ali, count_max);
406  free(default_ali);
407  break;
408  }
409  case FMT_EFT:
410  writer = new AwtiExportWriter(gbsrc, src, eft, dest, compress);
411  break;
412 
413  case FMT_NONE:
414  error = "internal error";
415  exitcode = 2;
416  break;
417  }
418 
419  if (writer) {
420  // generate file
421  for (GBDATA *gbspec = GBT_first_species(gbsrc);
422  gbspec && writer->ok();
423  gbspec = GBT_next_species(gbspec))
424  {
425  GBDATA *gbname = GB_find(gbspec, "acc", SEARCH_CHILD);
426  string name(GB_read_char_pntr(gbname));
427 
428  if (!accs || accessions.count(name)) {
429  writer->addSequence(gbspec);
430  }
431  GB_release(gbspec);
432  }
433 
434  if (writer->ok()) writer->finish();
435 
436  if (!writer->ok()) {
437  error = GBS_static_string(writer->get_error());
438  exitcode = 5;
439  }
440  delete writer;
441  }
442  }
443  if (gbsrc) GB_close(gbsrc);
444 
445  if (error) cerr << "ERROR: " << error << endl;
446  return exitcode;
447 }
448 
GB_ERROR GB_begin_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2492
GB_ERROR GB_copy_dropProtectMarksAndTempstate(GBDATA *dest, GBDATA *source)
Definition: arbdb.cxx:2120
#define arb_assert(cond)
Definition: arb_assert.h:245
string result
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GB_ERROR GB_commit_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2515
virtual bool ok() const =0
string readData(GBDATA *spec)
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
return string(buffer, length)
void addSequence(GBDATA *gbspec) OVERRIDE
GBDATA * GB_find(GBDATA *gbd, const char *key, GB_SEARCH_TYPE gbs)
Definition: adquery.cxx:295
virtual void addSequence(GBDATA *)=0
static void help()
AwtiExportWriter(GBDATA *_gbmain, const char *_dbname, const char *eft, const char *out, int c)
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1030
static set< string > read_accession_file(const char *file)
ArbWriter(string ta, string f, int c)
bool ok() const
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:706
void addSequence(GBDATA *spec) OVERRIDE
Definition: reader.h:95
GB_ERROR GB_release(GBDATA *gbd)
Definition: arbdb.cxx:2564
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
GBDATA * GB_create_container(GBDATA *father, const char *key)
Definition: arbdb.cxx:1803
GB_ERROR ARB_format_alignment(GBDATA *Main, const char *alignment_name)
Definition: insdel.cxx:1439
GB_ERROR GB_save_as(GBDATA *gbd, const char *path, const char *savetype)
static void error(const char *msg)
Definition: mkptypes.cxx:96
char * str
Definition: defines.h:20
equals(const T &_t)
GB_ERROR export_by_format(GBDATA *gb_main, ExportWhich which, const char *one_species, AP_filter *filter, int cut_stop_codon, int compress, const char *dbname, const char *formname, const char *field_transfer_set, const char *outname, int multiple, char **real_outname)
Definition: seq_export.cxx:757
virtual ~Writer()
int main(int argc, char **argv)
static void aw_status(double d)
virtual void finish()
string readString(GBDATA *spec, const char *key)
void finish() OVERRIDE
xml element
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2737
#define OVERRIDE
Definition: cxxforward.h:93
const struct formatTable formname[]
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
MultiFastaWriter(string s, const char *a, int c)
const char * GBS_static_string(const char *str)
Definition: arb_msg.cxx:213
static int line
Definition: arb_a2ps.c:296
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
virtual void set_error(const char *error)
#define NULp
Definition: cxxforward.h:97
bool GB_is_regularfile(const char *path)
Definition: arb_file.cxx:76
const char * get_error() const
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:675
Definition: trnsprob.h:20
void GBT_mark_all(GBDATA *gb_main, int flag)
Definition: aditem.cxx:291
GB_ERROR GB_request_undo_type(GBDATA *gb_main, GB_UNDO_TYPE type) __ATTR__USERESULT_TODO
Definition: adindex.cxx:718
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:874
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:625
bool operator()(const T &_t)
GB_write_int const char s
Definition: AW_awar.cxx:156