ARB
FilteredExport.h
Go to the documentation of this file.
1 // ============================================================ //
2 // //
3 // File : FilteredExport.h //
4 // Purpose : encapsulate SAI-filtered fasta exporter //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in June 2017 //
7 // http://www.arb-home.de/ //
8 // //
9 // ============================================================ //
10 
11 #ifndef FILTEREDEXPORT_H
12 #define FILTEREDEXPORT_H
13 
14 #ifndef AP_FILTER_HXX
15 #include <AP_filter.hxx>
16 #endif
17 #ifndef _GLIBCXX_STRING
18 #include <string>
19 #endif
20 #ifndef _STDINT_H
21 #include <stdint.h>
22 #endif
23 
/// Kind of a SAI filter definition: decides how the character list of a
/// FilterDefinition is interpreted.
enum FilterDefType {
    BLOCK, ///< listed characters are blocking characters
    PASS   ///< listed characters are permeable characters
};
25 
/// Lookup table over all 256 byte values, built from a character-range
/// specification such as "a-zA-Z".
///
/// Rules applied while parsing the specification:
///  - a '-' at the start or at the end of the string is accepted as a plain character;
///  - only forward ranges consisting purely of alphanumeric characters get expanded;
///  - any other "x-y" triple is inserted literally (i.e. 'x', '-' and 'y' get set).
class CharRangeTable {
    bool table[256];

    /// @return true if every character in [from, to] is alphanumeric and the
    ///         range runs forward (from <= to); only such ranges get expanded.
    static bool is_expandable(uint8_t from, uint8_t to) {
        if (from>to) return false; // backward range
        for (unsigned b = from; b<=to; ++b) {
            if (!isalnum(b)) return false; // expanding non-alphanumeric characters would be dangerous
        }
        return true;
    }

public:
    /// Build the table from a range specification.
    /// @param chars zero-terminated specification; a null pointer yields an empty table.
    CharRangeTable(const char *chars) :
        table() // value-initialization clears all 256 entries (independent of sizeof(bool))
    {
        if (!chars) return;

        uint8_t prevchar = 0;
        for (int i = 0; chars[i]; ++i) {
            const uint8_t c = chars[i];

            // A '-' only acts as range operator if something precedes it AND something follows it.
            const uint8_t toChar = (c == '-' && prevchar) ? static_cast<uint8_t>(chars[i+1]) : 0;

            if (toChar) {
                ++i; // consume the range end as well

                if (is_expandable(prevchar, toChar)) {
                    for (unsigned b = prevchar; b<=toChar; ++b) {
                        table[b] = true;
                    }
                }
                else { // do not expand -> insert literally
                    table[prevchar] = true;
                    table['-']      = true;
                    table[toChar]   = true;
                }
            }
            else {
                table[c] = true; // plain character (includes '-' at start or end)
            }

            // Note: after a consumed range this stores '-' (not the range end), so a directly
            // following "-x" is never expanded, because '-' is not alphanumeric.
            prevchar = c;
        }
    }

    /// @return true if byte value 'i' is part of the table.
    bool isSet(uint8_t i) const { return table[i]; }

    /// @return all set characters in ascending byte order as zero-terminated string.
    ///         The result lives in a function-local static buffer, i.e. it gets
    ///         overwritten by the next call (on any instance).
    const char *expandedRange() const {
        static char buf[256+1];

        int used = 0;
        for (unsigned i = 0; i<256; ++i) {
            if (isSet(i)) {
                buf[used++] = char(i);
            }
        }
        buf[used] = 0;
        return buf;
    }
};
87 
89  FilterDefType type;
90 
91  std::string sai_name;
92  std::string characters; // type == BLOCK -> blocking characters; type==PASS -> permeable characters
93 
94  bool inverse; // true -> do not use 'characters', use rest of ASCII set
95 
96 public:
97  FilterDefinition(const char *sai_name_, FilterDefType type_, bool filter_chars, const char *characters_) :
98  type(type_),
99  sai_name(sai_name_),
100  characters(characters_),
101  inverse(!filter_chars)
102  {}
103 
104  FilterDefType get_type() const { return type; }
105  AP_filter *make_filter(GBDATA *gb_main, const char *aliName, size_t aliSize) const;
106 };
107 
108 
109 class FilteredExport : virtual Noncopyable {
110  GBDATA *gb_main;
111  char *aliname;
112  size_t alisize;
113 
114  bool accept_missing_data;
115 
116  char *header_ACI;
117  char *sequence_ACI;
118 
119  // min requirements for export (which chars to count + min. counts required)
120  CharRangeTable count_table;
121  int minCount;
122 
123 
124  AP_filter filter;
125  bool filter_added; // add_SAI_filter called yet?
126 
127  char *get_filtered_sequence(GBDATA *gb_species, const char*& reason) const;
128  char *get_fasta_header(GBDATA *gb_species) const; // w/o leading '>'
129 
130 #if defined(UNIT_TESTS)
131  friend void TEST_FilteredExport(); // allow test inspection
132 #endif
133 
134  int count_bases(const char *seq) const;
135 
136 public:
137  FilteredExport(GBDATA *gb_main_, const char *aliname_, size_t alisize_);
138  ~FilteredExport();
139 
140  // configuration:
141  void do_accept_missing_data() { accept_missing_data = true; }
142  void set_required_baseCount(const char *basesToCount, int minCount_) {
143  minCount = minCount_;
144  count_table = CharRangeTable(basesToCount);
145  arb_assert(implicated(minCount>0, basesToCount));
146  }
148  void set_header_ACI(const char *aci) { freedup(header_ACI, aci); }
149  void set_sequence_ACI(const char *aci) { freedup(sequence_ACI, aci); }
152  filter = AP_filter(alisize);
153  filter_added = false;
154  }
155 
156  // access:
157  const char *get_aliname() const {
158  return aliname;
159  }
160 
161  // action:
162  GB_ERROR write_fasta(FILE *out);
163 };
164 
165 
166 #else
167 #error FilteredExport.h included twice
168 #endif // FILTEREDEXPORT_H
#define arb_assert(cond)
Definition: arb_assert.h:245
const char * GB_ERROR
Definition: arb_core.h:25
FilterDefType
#define implicated(hypothesis, conclusion)
Definition: arb_assert.h:289
return string(buffer, length)
AP_filter * make_filter(GBDATA *gb_main, const char *aliName, size_t aliSize) const
void set_required_baseCount(const char *basesToCount, int minCount_)
FILE * seq
Definition: rns.c:46
GB_ERROR write_fasta(FILE *out)
FilteredExport(GBDATA *gb_main_, const char *aliname_, size_t alisize_)
void reset_required_baseCount()
void do_accept_missing_data()
int chars
Definition: seq_search.cxx:38
void set_header_ACI(const char *aci)
void clear_SAI_filters()
FilterDefType get_type() const
bool isSet(uint8_t i) const
const char * get_aliname() const
CharRangeTable(const char *chars)
const char * expandedRange() const
#define __ATTR__USERESULT
Definition: attributes.h:58
FilterDefinition(const char *sai_name_, FilterDefType type_, bool filter_chars, const char *characters_)
#define NULp
Definition: cxxforward.h:116
void set_sequence_ACI(const char *aci)
GB_ERROR add_SAI_filter(const FilterDefinition &filterDef) __ATTR__USERESULT
GBDATA * gb_main
Definition: adname.cxx:32