ARB
FilteredExport.cxx
Go to the documentation of this file.
1 // ============================================================ //
2 // //
3 // File : FilteredExport.cxx //
4 // Purpose : encapsulate SAI-filtered fasta exporter //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in June 2017 //
7 // http://www.arb-home.de/ //
8 // //
9 // ============================================================ //
10 
11 #include "FilteredExport.h"
12 #include <arbdbt.h>
13 #include <gb_aci.h>
14 #include <arb_progress.h>
15 #include <algorithm>
16 
17 AP_filter *FilterDefinition::make_filter(GBDATA *gb_main, const char *aliName, size_t aliSize) const {
23  AP_filter *filter = NULp;
24 
25  {
26  GBDATA *gb_sai = GBT_expect_SAI(gb_main, sai_name.c_str());
27  if (!gb_sai) error = GB_await_error();
28  else {
29  GBDATA *gb_data = GBT_find_sequence(gb_sai, aliName);
30  if (!gb_data) {
31  error = GBS_global_string("SAI '%s' has no data in alignment '%s'", sai_name.c_str(), aliName);
32  }
33  else {
34 #if defined(ASSERTION_USED)
35  long datasize = GB_read_count(gb_data); // may be less than ali-length!
36  arb_assert(datasize == long(aliSize)); // @@@ write a test failing this assertion (BLOCK and PASS need to handle this differently)
37 #endif
38 
39  char *sai_data = GB_read_as_string(gb_data);
40  if (!sai_data) error = GB_await_error();
41  else {
42  bool blockChars = (type == BLOCK) != inverse;
43  CharRangeTable crt(characters.c_str());
44  if (blockChars) {
45  filter = new AP_filter(sai_data, crt.expandedRange(), aliSize); // blocks characters
46  }
47  else {
48  AP_filter inv_filt(sai_data, crt.expandedRange(), aliSize);
49  filter = new AP_filter(NOT, inv_filt);
50  }
51  free(sai_data);
52  }
53  }
54  }
55  }
56 
57  arb_assert(contradicted(filter, error));
58  if (error) GB_export_error(error);
59  return filter;
60 }
61 
62 FilteredExport::FilteredExport(GBDATA *gb_main_, const char *aliname_, size_t alisize_) :
63  gb_main(gb_main_),
64  aliname(nulldup(aliname_)),
65  alisize(alisize_),
66  accept_missing_data(false),
67  header_ACI(strdup("readdb(name)")),
68  sequence_ACI(NULp),
69  count_table(NULp),
70  minCount(0),
71  filter(alisize),
72  filter_added(false)
73 {}
74 
76  free(header_ACI);
77  free(sequence_ACI);
78  free(aliname);
79 }
80 
82  AP_filter *newFilter = filterDef.make_filter(gb_main, aliname, alisize);
83  if (!newFilter) {
84  return GB_await_error();
85  }
86 
87  if (!filter_added) {
88  filter = *newFilter;
89  filter_added = true;
90  }
91  else {
92  switch (filterDef.get_type()) {
93  case PASS: filter = AP_filter(filter, OR, *newFilter); break;
94  case BLOCK: filter = AP_filter(filter, AND, *newFilter); break;
95  }
96  }
97  delete newFilter;
98  return NULp;
99 }
100 
101 int FilteredExport::count_bases(const char *seq) const {
102  int count = 0;
103  for (int p = 0; seq[p]; ++p) {
104  count += count_table.isSet(seq[p]);
105  }
106  return count;
107 }
108 
109 char *FilteredExport::get_filtered_sequence(GBDATA *gb_species, const char*& reason) const {
110  /* returns filtered sequence or
111  * NULp (which means "do not export")
112  * - if an error occurred (error is exported only in that case!),
113  * - if species had no data in alignment (and accept_missing_data is true) or
114  * - if filtered sequence does not have min. required base count.
115  * If NULp returned and no error exported -> 'reason' gets set!
116  */
117  arb_assert(gb_species);
119 
120  reason = NULp;
121 
122  char *seq = NULp;
123  GBDATA *gb_data = GBT_find_sequence(gb_species, aliname);
124 
125  if (!gb_data) {
126  if (GB_have_error()) {
127  GB_export_error(GB_failedTo_error("read sequence of ", GBT_get_name_or_description(gb_species), GB_await_error()));
128  }
129  else {
130  if (accept_missing_data) {
131  reason = "has no data";
132  }
133  else {
134  GB_export_errorf("species '%s' has no data in '%s'", GBT_get_name_or_description(gb_species), aliname);
135  }
136  }
137  }
138  else {
139  GB_ERROR error = filter.is_invalid();
140  if (error) GB_export_error(error);
141  else {
142  const char *cseq = GB_read_char_pntr(gb_data);
143  seq = filter.filter_string(cseq);
144 
145  // check min. requirements:
146  if (minCount>0) { // otherwise check would always succeed
147  int count = count_bases(seq);
148  if (count<minCount) { // too few bases -> do not export
149  freenull(seq);
150  reason = "not enough base-characters left";
151  }
152  }
153  }
154  }
155 
156  if (seq && sequence_ACI) {
157  char *seq_postprocessed = GB_command_interpreter_in_env(seq, sequence_ACI, GBL_simple_call_env(gb_species));
158  if (seq_postprocessed) {
159  freeset(seq, seq_postprocessed);
160  }
161  else {
162  char *error = strdup(GB_await_error());
163  GB_export_errorf("Failed to post-process sequence data of species '%s' (Reason: %s)", GBT_get_name_or_description(gb_species), error);
164  free(error);
165  freenull(seq);
166  }
167  }
168 
169  arb_assert(contradicted(seq, GB_have_error() || reason));
170  arb_assert(implicated(!seq, contradicted(GB_have_error(), reason)));
171 
172  return seq;
173 }
174 
175 char *FilteredExport::get_fasta_header(GBDATA *gb_species) const {
176  return GB_command_interpreter_in_env("", header_ACI, GBL_simple_call_env(gb_species));
177 }
178 
180  GB_ERROR error = NULp;
181  int exported = 0;
182  int skipped = 0;
183 
184  {
185  arb_progress progress("Write sequence data", GBT_get_species_count(gb_main));
186 
187  for (GBDATA *gb_species = GBT_first_species(gb_main);
188  gb_species && !error;
189  gb_species = GBT_next_species(gb_species))
190  {
191  const char *reason;
192  char *filt_seq = get_filtered_sequence(gb_species, reason);
193 
194  if (filt_seq) {
195  ++exported; // count exported species
196  fputc('>', out);
197  {
198  char *header = get_fasta_header(gb_species);
199  if (header) {
200  fputs(header, out);
201  free(header);
202  }
203  else {
204  error = GB_await_error();
205  }
206  }
207  fputc('\n', out);
208 
209  fputs(filt_seq, out);
210  fputc('\n', out);
211 
212  free(filt_seq);
213  }
214  else {
215  if (reason) {
216  ++skipped; // count skipped
217  fprintf(stderr, "Skipped species '%s' (Reason: %s)\n", GBT_get_name_or_description(gb_species), reason);
218  }
219  else {
220  error = GB_await_error();
221  }
222  }
223  ++progress;
224  }
225 
226  if (error) progress.done();
227  }
228 
229  if (!error) {
230  if (exported) {
231  fprintf(stderr, "Summary: %i species exported", exported);
232  if (skipped) fprintf(stderr, ", %i species skipped", skipped);
233  fputc('\n', stderr);
234  }
235  else {
236  fprintf(stderr, "Summary: all %i species skipped (warning: generated empty file)\n", skipped);
237  }
238  }
239 
240  return error;
241 }
242 
243 // --------------------------------------------------------------------------------
244 
245 #ifdef UNIT_TESTS
246 #ifndef TEST_UNIT_H
247 #include <test_unit.h>
248 #endif
249 
250 #define TEST_EXPECT_CRT_DEFINES(arg,expected) TEST_EXPECT_EQUAL(CharRangeTable(arg).expandedRange(), expected)
251 
252 #define ABC "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
253 #define abc "abcdefghijklmnopqrstuvwxyz"
254 
255 void TEST_CharRangeTable() {
256  TEST_EXPECT_CRT_DEFINES("abc", "abc");
257  TEST_EXPECT_CRT_DEFINES("cba", "abc");
258  TEST_EXPECT_CRT_DEFINES("c-a", "-ac"); // unwanted reverse range does not expand!
259  TEST_EXPECT_CRT_DEFINES("a-c", "abc");
260 
261  TEST_EXPECT_CRT_DEFINES("a-db-e", "abcde");
262  TEST_EXPECT_CRT_DEFINES("a-de-b", "-abcde");
263 
264  TEST_EXPECT_CRT_DEFINES("-ab", "-ab");
265  TEST_EXPECT_CRT_DEFINES("a-b", "ab");
266  TEST_EXPECT_CRT_DEFINES("ab-", "-ab");
267 
268  TEST_EXPECT_CRT_DEFINES("a-ac-c", "ac");
269 
270  TEST_EXPECT_CRT_DEFINES("a-zA-Z", ABC abc);
271 
272  // dangerous ranges are not expanded:
273  TEST_EXPECT_CRT_DEFINES(".-=", "-.=");
274  TEST_EXPECT_CRT_DEFINES("=-.", "-.=");
275 
276  TEST_EXPECT_CRT_DEFINES("a-Z", "-Za");
277  TEST_EXPECT_CRT_DEFINES("A-z", "-Az");
278 }
279 
280 #undef abc
281 #undef ABC
282 
283 #define TEST_EXPECT_SEQWITHLENGTH__NOERROREXPORTED(create_seqcopy, expected_length) do { \
284  char *seqcopy; \
285  TEST_EXPECT_RESULT__NOERROREXPORTED(seqcopy = create_seqcopy); \
286  TEST_EXPECT_EQUAL(strlen(seqcopy), expected_length); \
287  free(seqcopy); \
288  } while(0)
289 
290 void TEST_FilteredExport() {
291  // see also ../../TOOLS/arb_test.cxx@SAI_FILTERED_EXPORT_TESTS
292 
293  GB_shell shell;
294  GBDATA *gb_main = GB_open("TEST_prot_tiny.arb", "r"); // ../../UNIT_TESTER/run/TEST_prot_tiny.arb
295 
296  // only "CytLyti6" has data in 'ali_dna_incomplete'
297 
298  {
299  char *ali_name = NULp;
300  size_t ali_size = 0;
301 
302  {
303  GB_transaction ta(gb_main);
304  ali_name = GBT_get_default_alignment(gb_main);
305  ali_size = GBT_get_alignment_len(gb_main, ali_name);
306  }
307 
308  {
309  FilteredExport exporter(gb_main, ali_name, ali_size);
310  FilteredExport exporter_incomplete(gb_main, "ali_dna_incomplete", ali_size);
311 
312  {
313  GB_transaction ta(gb_main);
314 
315  GBDATA *gb_spec1 = GBT_find_species(gb_main, "StrRamo3"); // ~ 60 AA
316  GBDATA *gb_spec2 = GBT_find_species(gb_main, "MucRacem"); // more AA
317  GBDATA *gb_spec3 = GBT_find_species(gb_main, "BctFra12");
318  GBDATA *gb_spec4 = GBT_find_species(gb_main, "CytLyti6");
319 
320  TEST_REJECT_NULL(gb_spec1);
321  TEST_REJECT_NULL(gb_spec2);
322  TEST_REJECT_NULL(gb_spec3);
323  TEST_REJECT_NULL(gb_spec4);
324 
325  const char *reason;
326  TEST_EXPECT_SEQWITHLENGTH__NOERROREXPORTED(exporter.get_filtered_sequence(gb_spec1, reason), ali_size);
327  TEST_EXPECT_SEQWITHLENGTH__NOERROREXPORTED(exporter.get_filtered_sequence(gb_spec2, reason), ali_size);
328 
329  TEST_EXPECT_NORESULT__ERROREXPORTED_CONTAINS(exporter_incomplete.get_filtered_sequence(gb_spec3, reason), "has no data in 'ali_dna_incomplete'");
330  TEST_EXPECT_NULL(reason);
331 
332  exporter_incomplete.do_accept_missing_data(); // changes state of exporter_incomplete!
333 
334  TEST_EXPECT_NORESULT__NOERROREXPORTED(exporter_incomplete.get_filtered_sequence(gb_spec3, reason));
335  TEST_EXPECT_EQUAL(reason, "has no data");
336 
337  TEST_EXPECT_SEQWITHLENGTH__NOERROREXPORTED(exporter.get_filtered_sequence(gb_spec4, reason), ali_size);
338 
339  // test header-generation:
340  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(exporter.get_fasta_header(gb_spec1), "StrRamo3");
341  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(exporter.get_fasta_header(gb_spec2), "MucRacem");
342 
343  exporter.set_header_ACI("readdb(name);\",\";readdb(full_name)"); // use real ACI for header
344  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(exporter.get_fasta_header(gb_spec3), "BctFra12,Bacteroides fragilis");
345 
346  exporter.set_header_ACI("readdb(name);\",\";readdb(ali_dna);\",\";readdb(nosuchfield);\",\";readdb(full_name)"); // wrong accepted use (try to read a container and a unknown field)
347  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(exporter.get_fasta_header(gb_spec4), "CytLyti6,,,Cytophaga lytica"); // both produce empty output
348 
349  exporter.set_header_ACI("readdb(name);\",\";bugme"); // wrong rejected use (unknown command)
350  TEST_EXPECT_NORESULT__ERROREXPORTED_CONTAINS(exporter.get_fasta_header(gb_spec4), "Unknown command 'bugme'"); // aborts with error
351 
352  // test sequences are skipped if to few bases remain:
353  exporter.set_required_baseCount("ACGT", 185); // (bit more than 3*60)
354  TEST_EXPECT_NORESULT__NOERROREXPORTED(exporter.get_filtered_sequence(gb_spec1, reason));
355  TEST_EXPECT_EQUAL(reason, "not enough base-characters left");
356  exporter.reset_required_baseCount();
357 
358  // test FilterDefinition
359  FilterDefinition pvp_blck_variab("POS_VAR_BY_PARSIMONY", BLOCK, true, "-=.0123");
360  FilterDefinition pvp_pass_variab("POS_VAR_BY_PARSIMONY", PASS, true, "-=.0123");
361  FilterDefinition pvp_blck_cnsrvd("POS_VAR_BY_PARSIMONY", BLOCK, false, "-=.012345");
362  FilterDefinition pvp_pass_cnsrvd("POS_VAR_BY_PARSIMONY", PASS , false, "-=.012345");
363 
364  {
365  AP_filter *filt_blck_variab;
366  AP_filter *filt_pass_variab;
367  AP_filter *filt_blck_cnsrvd;
368  AP_filter *filt_pass_cnsrvd;
369 
370  TEST_EXPECT_RESULT__NOERROREXPORTED(filt_blck_variab = pvp_blck_variab.make_filter(gb_main, ali_name, ali_size));
371  TEST_EXPECT_RESULT__NOERROREXPORTED(filt_pass_variab = pvp_pass_variab.make_filter(gb_main, ali_name, ali_size));
372  TEST_EXPECT_RESULT__NOERROREXPORTED(filt_blck_cnsrvd = pvp_blck_cnsrvd.make_filter(gb_main, ali_name, ali_size));
373  TEST_EXPECT_RESULT__NOERROREXPORTED(filt_pass_cnsrvd = pvp_pass_cnsrvd.make_filter(gb_main, ali_name, ali_size));
374 
375  TEST_EXPECT_EQUAL(filt_blck_variab->get_length(), ali_size);
376  TEST_EXPECT_EQUAL(filt_pass_variab->get_length(), ali_size);
377  TEST_EXPECT_EQUAL(filt_blck_cnsrvd->get_length(), ali_size);
378  TEST_EXPECT_EQUAL(filt_pass_cnsrvd->get_length(), ali_size);
379 
380  TEST_EXPECT_EQUAL(filt_blck_variab->get_filtered_length(), 135);
381  TEST_EXPECT_EQUAL(filt_pass_variab->get_filtered_length(), ali_size-135);
382  TEST_EXPECT_EQUAL(filt_blck_cnsrvd->get_filtered_length(), ali_size-45);
383  TEST_EXPECT_EQUAL(filt_pass_cnsrvd->get_filtered_length(), 45);
384 
385  delete filt_pass_cnsrvd;
386  delete filt_blck_cnsrvd;
387  delete filt_pass_variab;
388  delete filt_blck_variab;
389  }
390 
391  // test get_filtered_sequence with real filters:
392  TEST_EXPECT_NO_ERROR(exporter.add_SAI_filter(pvp_blck_variab));
393  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(exporter.get_filtered_sequence(gb_spec1, reason), "TCCACGCACCAAGTCCT--GGACTTTG--GACCGGGTCCCTGACG-ATG-CCGGGGACG---GACTGG--TC--GGGCCGCT-ACGG---CGCTCGGCCGCG-------GCCG-CTGGG----CCGCCGT.....");
394 
395  exporter.clear_SAI_filters();
396 
397  TEST_EXPECT_NO_ERROR(exporter.add_SAI_filter(pvp_pass_cnsrvd));
398  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(exporter.get_filtered_sequence(gb_spec1, reason), "TCACCAAGTTGGGTCGACCGGGAATGGGCCAGCCCGCGCTGGCCC");
399 
400  exporter.clear_SAI_filters();
401 
402  TEST_EXPECT_NO_ERROR(exporter.add_SAI_filter(pvp_blck_cnsrvd));
403  TEST_EXPECT_NO_ERROR(exporter.add_SAI_filter(pvp_blck_variab));
404  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(exporter.get_filtered_sequence(gb_spec1, reason), "CGCACCC--ACTTTG--GACCGGCCTCG-ATG-GCG---GCG--TC--GCGT-CG---CGTCGG-------GCG-CG----GCGT.....");
405 
406  // test sequence post-processing
407  exporter.set_sequence_ACI(":.=-"); // convert dots to hyphens (using SRT)
408  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(exporter.get_filtered_sequence(gb_spec1, reason), "CGCACCC--ACTTTG--GACCGGCCTCG-ATG-GCG---GCG--TC--GCGT-CG---CGTCGG-------GCG-CG----GCGT-----");
409 
410  exporter.set_sequence_ACI(":bad"); // malformed expression
411  TEST_EXPECT_NORESULT__ERROREXPORTED_CONTAINS(exporter.get_filtered_sequence(gb_spec1, reason), "SRT ERROR: no '=' found in command 'bad'");
412  }
413  }
414  free(ali_name);
415  }
416  GB_close(gb_main);
417 }
418 
419 #endif // UNIT_TESTS
420 
421 // --------------------------------------------------------------------------------
422 
423 
#define arb_assert(cond)
Definition: arb_assert.h:245
const char * GB_ERROR
Definition: arb_core.h:25
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
#define implicated(hypothesis, conclusion)
Definition: arb_assert.h:289
Definition: AP_filter.hxx:36
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1030
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:706
bool GB_have_error()
Definition: arb_msg.cxx:349
GBDATA * GBT_expect_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:180
AP_filter * make_filter(GBDATA *gb_main, const char *aliName, size_t aliSize) const
FILE * seq
Definition: rns.c:46
GB_ERROR write_fasta(FILE *out)
FilteredExport(GBDATA *gb_main_, const char *aliname_, size_t alisize_)
GB_ERROR GB_export_error(const char *error)
Definition: arb_msg.cxx:259
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:728
char * filter_string(const char *fulllen_string) const
Definition: AP_filter.cxx:239
#define false
Definition: ureadseq.h:13
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1309
static void error(const char *msg)
Definition: mkptypes.cxx:96
fputc('\n', stderr)
size_t get_length() const
Definition: AP_filter.hxx:83
FilterDefType get_type() const
bool isSet(uint8_t i) const
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:670
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
fputs(TRACE_PREFIX, stderr)
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:264
#define TEST_EXPECT_NULL(n)
Definition: test_unit.h:1307
GB_ERROR GB_failedTo_error(const char *do_something, const char *special, GB_ERROR error)
Definition: arb_msg.cxx:387
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
GB_ERROR is_invalid() const
Definition: AP_filter.hxx:123
#define TEST_EXPECT_NO_ERROR(call)
Definition: test_unit.h:1107
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:97
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:136
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:675
NOT4PERL char * GB_command_interpreter_in_env(const char *str, const char *commands, const GBL_call_env &callEnv)
Definition: gb_aci.cxx:361
GB_ERROR add_SAI_filter(const FilterDefinition &filterDef) __ATTR__USERESULT
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:203
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:874
GBDATA * gb_main
Definition: adname.cxx:33
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:437
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:625