ARB
arb_export_seq_filtered.cxx
Go to the documentation of this file.
1 // ============================================================ //
2 // //
3 // File : arb_export_seq_filtered.cxx //
4 // Purpose : export sequences filtered by SAI //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in June 2017 //
7 // http://www.arb-home.de/ //
8 // //
9 // ============================================================ //
10 
11 #include <FilteredExport.h>
12 #include <arbdbt.h>
13 #include <arb_file.h>
14 #include <string>
15 #include <list>
16 #include <cstdlib>
17 
18 using namespace std;
19 
20 class CLI : virtual Noncopyable {
21  bool helpWanted;
23 
24  string database; // name of input database
25  string fasta; // name of FASTA output file
26  string aliname; // name of alignment to use
27  string head_aci; // ACI used to create content of fasta header
28  string seq_aci; // ACI used to post-process sequence data
29 
30  bool accept_missing_data; // true -> skip sequences with less bases in data
31 
32  int min_bases; // skip sequences with less than 'min_bases' base-characters in data
33  string bases; // defines what counts as base-character
34 
35  string filterby; // name of last SAI specified with "--filterby"
36 
37  typedef SmartPtr<FilterDefinition> FilterDefinitionPtr;
38  typedef std::list<FilterDefinitionPtr> FilterDefinitionList;
39 
40  FilterDefinitionList def_filters;
41 
42  static inline const char *getarg(int& argc, const char**& argv) {
43  return argc>0 ? (--argc,*argv++) : NULp;
44  }
45  inline const char *expect_arg(int& argc, const char**& argv) {
46  const char *arg = getarg(argc, argv);
47  if (!arg) {
48  error = "expected argument missing";
49  arg = "";
50  }
51  return arg;
52  }
53  void show_help() const {
54  fputs("\n"
55  "arb_export_seq_filtered -- export sequences filtered by SAI\n"
56  "Usage: arb_export_seq_filtered [switches]\n"
57  "\n"
58  "general:\n"
59  " <chars> defines a set of characters (for blocking/passthrough).\n"
60  " It may contain alphanumeric ranges, e.g. '0-5' which is equiv. to '012345'.\n"
61  " To specify a literal '-' put it at start or end of <chars>.\n"
62  "\n"
63  "switches:\n"
64  "--db <dbname> ARB database to export from\n"
65  "--fasta <outname> name of generated fasta file\n"
66  "\n"
67  "--id <aci> specify content for fasta header using ACI\n"
68  " (default: \"readdb(name)\"; see http://help.arb-home.de/aci.html)\n"
69  "\n"
70  "--ali <aliname> name of alignment to use (default: use default alignment)\n"
71  "--accept-missing-data silently skip species w/o data (default: abort with error)\n"
72  "--seqpp <aci> specify ACI to postprocess sequence data (default: none; example dot->hyphen: \":.=-\")\n"
73  "\n"
74  "--filterby <SAI> filter sequences using <SAI> (SAI has to be stored in database)\n"
75  "--block [allbut] <chars> exclude columns where <SAI> contains any of <chars>\n"
76  "--pass [allbut] <chars> include columns where <SAI> contains any of <chars>\n"
77  " (specifying 'allbut' will invert the set of <chars>)\n"
78  "\n"
79  "--min-bases <count> min. base count of exported data (after filtering).\n"
80  " Export of species is skipped if not reached and\n"
81  " a diagnostic message gets written to stderr.\n"
82  "--count-bases <chars> defines base-characters to count (default: \"a-zA-Z\")\n"
83  "\n"
84  ,stderr);
85  }
86 
87  const char *missing_passBlock() { return "--filterby has to be followed by --pass or --block"; }
88  const char *missing_filterby() { return "--pass and --block have to be preceeded by --filterby"; }
89 
90  void store_filterdef(FilterDefinition *def) {
91  if (def) def_filters.push_back(def);
92  }
93  FilterDefinition *expect_filterdef_args(FilterDefType type, int& argc, const char**& argv) {
95  string arg = expect_arg(argc, argv);
96  if (!error) {
97  bool filter_chars = true;
98 
99  if (arg == "allbut") {
100  filter_chars = false;
101  arg = expect_arg(argc, argv);
102  }
103 
104  if (!error) {
105  if (filterby.empty()) {
106  error = missing_filterby();
107  }
108  else {
109  result = new FilterDefinition(filterby.c_str(), type, filter_chars, arg.c_str());
110  }
111  }
112  }
113  filterby = ""; // forget last filterby (even in error-case -> avoids some weird error-messages)
114  return result;
115  }
116 
117  void parse(int& argc, const char**& argv) {
118  const char *arg = getarg(argc, argv);
119  if (arg) {
120  if (strcmp(arg, "--db") == 0) database = expect_arg(argc, argv);
121  else if (strcmp(arg, "--fasta") == 0) fasta = expect_arg(argc, argv);
122  else if (strcmp(arg, "--id") == 0) head_aci = expect_arg(argc, argv);
123  else if (strcmp(arg, "--seqpp") == 0) seq_aci = expect_arg(argc, argv);
124  else if (strcmp(arg, "--ali") == 0) aliname = expect_arg(argc, argv);
125  else if (strcmp(arg, "--count-bases") == 0) bases = expect_arg(argc, argv);
126 
127  else if (strcmp(arg, "--filterby") == 0) {
128  if (!filterby.empty()) error = missing_passBlock();
129  else filterby = expect_arg(argc, argv);
130  }
131 
132  else if (strcmp(arg, "--block") == 0) store_filterdef(expect_filterdef_args(BLOCK, argc, argv));
133  else if (strcmp(arg, "--pass") == 0) store_filterdef(expect_filterdef_args(PASS, argc, argv));
134 
135  else if (strcmp(arg, "--accept-missing-data") == 0) accept_missing_data = true;
136 
137  else if (strcmp(arg, "--min-bases") == 0) min_bases = atoi(expect_arg(argc, argv));
138 
139  else if (strcmp(arg, "--help") == 0 || strcmp(arg, "-h") == 0) helpWanted = true;
140 
141  else {
142  error = GBS_global_string("unexpected argument '%s'", arg);
143  }
144  }
145  }
146  void check_required_arguments() {
147  if (database.empty()) error = "no input database specified";
148  else if (fasta.empty()) error = "no output file specified";
149  }
150 
151 public:
152  CLI(int argc, const char **argv) :
153  helpWanted(false),
154  error(NULp),
155  accept_missing_data(false),
156  min_bases(0),
157  bases("a-zA-Z")
158  {
159  --argc; ++argv;
160  while (!error && argc>0 && !helpWanted) {
161  parse(argc, argv);
162  }
163 
164  if (!helpWanted) { // do not check extended conditions, if '--help' seen
165  if (!error && !filterby.empty()) {
166  error = missing_passBlock();
167  }
168  if (!error) {
169  check_required_arguments();
170  if (error) helpWanted = true;
171  }
172  }
173  }
174 
175  bool help_wanted() const { return helpWanted; }
176  void show_help_if_useful() const { if (helpWanted) show_help(); }
177  GB_ERROR get_error() const { return error; }
178 
179  const char *get_database() const { return database.c_str(); }
180  const char *get_fasta() const { return fasta.c_str(); }
181  const char *get_aliname() const { return aliname.c_str(); }
182  const char *get_head_aci() const { return head_aci.c_str(); }
183  const char *get_seq_aci() const { return seq_aci.c_str(); }
184  const char *get_basechars() const { return bases.c_str(); }
185 
186  bool shall_accept_missing_data() const { return accept_missing_data; }
187 
188  int get_min_bases_count() const { return min_bases; }
189 
190  bool has_filters() const { return !def_filters.empty(); }
192  arb_assert(!error);
193  GB_ERROR err = NULp;
194  for (FilterDefinitionList::const_iterator d = def_filters.begin(); !err && d != def_filters.end(); ++d) {
195  err = exporter.add_SAI_filter(**d);
196  }
197  return err;
198  }
199 };
200 
201 static GB_ERROR export_seq_data(const CLI& args) {
202  GB_ERROR error = NULp;
203 
204  const char *dbname = args.get_database();
205  GB_shell shell;
206  GBDATA *gb_main = GB_open(dbname, "r");
207 
208  if (!gb_main) {
209  error = GB_await_error();
210  }
211  else {
212  {
213  GB_transaction ta(gb_main);
214 
215  char *aliname = strdup(args.get_aliname());
216  if (aliname[0]) {
217  if (!GBT_get_alignment(gb_main, aliname)) {
218  error = GB_await_error();
219  }
220  }
221  else {
222  freeset(aliname, GBT_get_default_alignment(gb_main));
223  if (!aliname || !aliname[0]) {
224  GB_clear_error();
225  error = "no default alignment defined (use --ali to specify alignment name)";
226  }
227  else {
228  fprintf(stderr, "Using default alignment '%s'\n", aliname);
229  }
230  }
231 
232  if (!error) {
233  arb_assert(aliname && aliname[0]);
234 
235  long alisize = GBT_get_alignment_len(gb_main, aliname);
236  if (alisize<=0) {
237  error = GB_await_error();
238  }
239  else {
240  const char *outname = args.get_fasta();
241  FILE *out = fopen(outname, "wt");
242 
243  if (!out) {
244  error = GB_IO_error("writing", outname);
245  }
246  else {
247  FilteredExport exporter(gb_main, aliname, alisize);
248 
249  if (args.get_head_aci()[0]) {
250  exporter.set_header_ACI(args.get_head_aci());
251  }
252  if (args.shall_accept_missing_data()) {
253  exporter.do_accept_missing_data();
254  }
255  if (args.get_min_bases_count()>0) {
257  }
258  if (args.get_seq_aci()[0]) {
259  exporter.set_sequence_ACI(args.get_seq_aci());
260  }
261  if (args.has_filters()) {
262  arb_assert(!error);
263  error = args.add_filters_to(exporter);
264  }
265 
266  if (!error) {
267  error = exporter.write_fasta(out);
268  }
269  fclose(out);
270 
271  if (error) GB_unlink_or_warn(outname, NULp);
272  }
273  }
274  }
275  free(aliname);
276  }
277  GB_close(gb_main);
278  }
279 
280  return error;
281 }
282 
283 int main(int argc, char **argv) {
284  GB_ERROR error = NULp;
285  CLI args(argc, const_cast<const char**>(argv));
286 
287  if (!error) error = args.get_error();
288 
289  if (!error && args.help_wanted()) {
290  args.show_help_if_useful();
291  return EXIT_FAILURE;
292  }
293 
294  if (!error) error = export_seq_data(args);
295 
296  if (error) {
297  args.show_help_if_useful();
298  fprintf(stderr, "Error in arb_export_seq_filtered: %s\n", error);
299  return EXIT_FAILURE;
300  }
301  return EXIT_SUCCESS;
302 }
303 
#define arb_assert(cond)
Definition: arb_assert.h:245
int get_min_bases_count() const
string result
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GB_TYPES type
static void show_help()
Definition: db_server.cxx:242
FilterDefType
const char * get_fasta() const
const char * get_basechars() const
void GB_unlink_or_warn(const char *path, GB_ERROR *error)
Definition: arb_file.cxx:206
const char * get_head_aci() const
GBDATA * GBT_get_alignment(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:808
GB_ERROR GB_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:285
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:833
STL namespace.
#define EXIT_SUCCESS
Definition: arb_a2ps.c:154
void set_required_baseCount(const char *basesToCount, int minCount_)
static GB_ERROR export_seq_data(const CLI &args)
GB_ERROR write_fasta(FILE *out)
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
GB_ERROR add_filters_to(FilteredExport &exporter) const
void do_accept_missing_data()
Generic smart pointer.
Definition: smartptr.h:149
void GB_clear_error()
Definition: arb_msg.cxx:354
#define false
Definition: ureadseq.h:13
void set_header_ACI(const char *aci)
static void error(const char *msg)
Definition: mkptypes.cxx:96
const char * get_database() const
#define EXIT_FAILURE
Definition: arb_a2ps.c:157
const char * get_aliname() const
void show_help_if_useful() const
fputs(TRACE_PREFIX, stderr)
CLI(int argc, const char **argv)
int main(int argc, char **argv)
bool help_wanted() const
#define NULp
Definition: cxxforward.h:116
const char * get_seq_aci() const
void set_sequence_ACI(const char *aci)
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:747
GB_ERROR add_SAI_filter(const FilterDefinition &filterDef) __ATTR__USERESULT
GB_transaction ta(gb_var)
GBDATA * gb_main
Definition: adname.cxx:32
bool shall_accept_missing_data() const
bool has_filters() const
GB_ERROR get_error() const
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:655