ARB
AP_filter.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : AP_filter.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "AP_filter.hxx"
12 #include <arbdb.h>
13 
14 // ------------------
15 // AP_filter
16 
17 void AP_filter::init(size_t size) {
18  filter_mask = new bool[size];
19  filter_len = size;
20  real_len = 0;
21  update = AP_timer();
23  simplify[0] = 0; // silence cppcheck-warning
24  bootstrap = NULp;
25  filterpos_2_seqpos = NULp;
26 
27 #if defined(ASSERTION_USED)
28  checked_for_validity = false;
29 #endif
30 }
31 
32 void AP_filter::make_permeable(size_t size) {
33  init(size);
34  real_len = filter_len;
35  for (size_t i = 0; i < size; i++) filter_mask[i] = true;
36 }
37 
38 void AP_filter::init_from_string(const char *ifilter, const char *zerobases, size_t size) {
39  init(size);
40 
41  bool char2mask[256];
42  size_t i;
43 
44  for (i = 0; i<256; ++i) char2mask[i] = true;
45  if (zerobases) {
46  for (i = 0; zerobases[i]; ++i) char2mask[safeCharIndex(zerobases[i])] = false;
47  }
48  else {
49  char2mask['0'] = false;
50  }
51 
52  real_len = 0;
53  for (i = 0; i < size && ifilter[i]; ++i) {
54  real_len += int(filter_mask[i] = char2mask[safeCharIndex(ifilter[i])]);
55  }
56  for (; i < size; i++) {
57  filter_mask[i] = true;
58  real_len++;
59  }
60 }
61 
62 
63 AP_filter::AP_filter(size_t size) {
64  make_permeable(size);
65 }
66 
68  : filter_mask(new bool[other.filter_len]),
69  filter_len(other.filter_len),
70  real_len(other.real_len),
71  update(other.update),
72  simplify_type(other.simplify_type),
73  bootstrap(NULp),
74  filterpos_2_seqpos(NULp)
75 {
76  memcpy(filter_mask, other.filter_mask, filter_len*sizeof(*filter_mask));
77  memcpy(simplify, other.simplify, sizeof(simplify)*sizeof(*simplify));
78  if (other.bootstrap) {
79  bootstrap = new size_t[real_len];
80  memcpy(bootstrap, other.bootstrap, real_len*sizeof(*bootstrap));
81  }
82  if (other.filterpos_2_seqpos) {
83  filterpos_2_seqpos = new size_t[real_len];
84  memcpy(filterpos_2_seqpos, other.filterpos_2_seqpos, real_len*sizeof(*filterpos_2_seqpos));
85  }
86 #if defined(ASSERTION_USED)
87  checked_for_validity = other.checked_for_validity;
88 #endif
89 }
90 
91 
92 
93 AP_filter::AP_filter(const char *ifilter, const char *zerobases, size_t size) {
94  if (!ifilter || !*ifilter) {
95  make_permeable(size);
96  }
97  else {
98  init_from_string(ifilter, zerobases, size);
99  }
100 }
101 
103  size_t size = other.get_length();
104  const bool *omask = other.filter_mask;
105 
106  init(size);
107  for (size_t i = 0; i < size; i++) {
108  real_len += (filter_mask[i] = !omask[i]);
109  }
110 }
111 
112 AP_filter::AP_filter(const AP_filter& f1, AF_Combine comb, const AP_filter& f2) {
113  size_t size = f1.get_length();
114  af_assert(size == f2.get_length());
115 
116  init(size);
117 
118  const bool *m1 = f1.filter_mask;
119  const bool *m2 = f2.filter_mask;
120 
121  switch (comb) {
122  case AND:
123  for (size_t i = 0; i<size; ++i) {
124  real_len += (filter_mask[i] = (m1[i] && m2[i]));
125  }
126  break;
127  case OR:
128  for (size_t i = 0; i<size; ++i) {
129  real_len += (filter_mask[i] = (m1[i] || m2[i]));
130  }
131  break;
132  case XOR:
133  for (size_t i = 0; i<size; ++i) {
134  real_len += (filter_mask[i] = (m1[i] ^ m2[i]));
135  }
136  break;
137  }
138 }
139 
141  delete [] bootstrap;
142  delete [] filter_mask;
143  delete [] filterpos_2_seqpos;
144 }
145 
146 char *AP_filter::to_string() const {
147  af_assert(checked_for_validity);
148 
149  char *data = ARB_alloc<char>(filter_len+1);
150 
151  for (size_t i=0; i<filter_len; ++i) {
152  data[i] = "01"[filter_mask[i]];
153  }
154  data[filter_len] = 0;
155 
156  return data;
157 }
158 
159 
161  if (type != simplify_type) {
162  int i;
163  for (i=0; i<32; i++) {
164  simplify[i] = '.';
165  }
166  for (; i<256; i++) { // LOOP_VECTORIZED // tested down to gcc 5.5.0 (may fail on older gcc versions)
167  simplify[i] = i;
168  }
169  switch (type) {
171  simplify[(unsigned char)'g'] = 'a';
172  simplify[(unsigned char)'G'] = 'A';
173  simplify[(unsigned char)'u'] = 'c';
174  simplify[(unsigned char)'t'] = 'c';
175  simplify[(unsigned char)'U'] = 'C';
176  simplify[(unsigned char)'T'] = 'C';
177  break;
179  af_assert(0); // not implemented or impossible!?
180  break;
182  break;
183  default:
184  af_assert(0);
185  break;
186  }
187 
188  simplify_type = type;
189  }
190 }
191 
192 void AP_filter::calc_filterpos_2_seqpos() {
193  af_assert(checked_for_validity);
194  af_assert(real_len>0);
195 
196  delete [] filterpos_2_seqpos;
197  filterpos_2_seqpos = new size_t[real_len];
198  size_t i, j;
199  for (i=j=0; i<filter_len; ++i) {
200  if (filter_mask[i]) {
201  filterpos_2_seqpos[j++] = i;
202  }
203  }
204 }
205 
207  af_assert(checked_for_validity);
208  af_assert(real_len>0);
209 
210  delete [] bootstrap;
211  bootstrap = new size_t[real_len];
212 
213  af_assert(filter_len < RAND_MAX);
214 
215  for (size_t i = 0; i<real_len; ++i) {
216  int r = GB_random(real_len);
217  af_assert(r >= 0); // otherwise overflow in random number generator
218  bootstrap[i] = r;
219  }
220 }
221 
222 char *AP_filter::blowup_string(const char *filtered_string, char fillChar) const {
226  af_assert(checked_for_validity);
227 
228  char *blownup = ARB_alloc<char>(filter_len+1);
229  size_t f = 0;
230 
231  for (size_t i = 0; i<filter_len; ++i) {
232  blownup[i] = use_position(i) ? filtered_string[f++] : fillChar;
233  }
234  blownup[filter_len] = 0;
235 
236  return blownup;
237 }
238 
239 char *AP_filter::filter_string(const char *fulllen_string) const {
243  af_assert(checked_for_validity);
244 
245  char *filtered = ARB_alloc<char>(real_len+1);
246  size_t f = 0;
247 
248  get_filterpos_2_seqpos(); // create if missing
249  for (size_t i = 0; i<real_len; ++i) {
250  size_t p = filterpos_2_seqpos[i];
251  filtered[f++] = fulllen_string[p];
252  }
253  filtered[f] = 0;
254 
255  return filtered;
256 }
257 
258 // -------------------
259 // AP_weights
260 
262  : len(fil->get_filtered_length()),
263  weights(NULp)
264 {
265 }
266 
267 AP_weights::AP_weights(const GB_UINT4 *w, size_t wlen, const AP_filter *fil)
268  : len(fil->get_filtered_length()),
269  weights(NULp)
270 {
272 
273  af_assert(wlen == fil->get_length());
274 
275  size_t i, j;
276  for (j=i=0; j<wlen; ++j) {
277  if (fil->use_position(j)) {
278  weights[i++] = w[j];
279  }
280  }
281  af_assert(j <= fil->get_length());
282  af_assert(i == fil->get_filtered_length());
283 }
284 
286  : len(other.len),
287  weights(NULp)
288 {
289  if (other.weights) {
291  memcpy(weights, other.weights, len*sizeof(*weights));
292  }
293 }
294 
296  free(weights);
297 }
298 
299 long AP_timer() {
300  static long time = 0;
301  return ++time;
302 }
303 
304 
305 // --------------------------------------------------------------------------------
306 
307 #ifdef UNIT_TESTS
308 #ifndef TEST_UNIT_H
309 #include <test_unit.h>
310 #endif
311 
312 #define TEST_EXPECT_EQUAL_FILTERS(f1,f2) do{ \
313  TEST_EXPECT_NO_ERROR((f1).is_invalid()); \
314  TEST_EXPECT_NO_ERROR((f2).is_invalid()); \
315  char *m1 = (f1).to_string(); \
316  char *m2 = (f2).to_string(); \
317  TEST_EXPECT_EQUAL(m1, m2); \
318  free(m2); \
319  free(m1); \
320  }while(0)
321 
322 void TEST_filter() {
323  const int LEN = 20;
324  const int MASK_BITCOUNT = 9;
325  const char *mask = "01100001110000110011";
326  const char *mask_inv = "10011110001111001100";
327  const char *mask_some = "00100101100101011001";
328  const char *seq = "MSKTAYTKVLFDRGSALDGK";
329  const char *seq_masked = "SK" "KVL" "SA""GK";
330  const char *blow_mask = "_SK____KVL____SA__GK";
331  const char *seq_masked_inv = "M""TAYT" "FDRG""LD";
332  const char *blow_mask_inv = "M__TAYT___FDRG__LD__";
333 
334  AP_filter f1(LEN);
335  AP_filter f2(mask, "0", LEN);
336  AP_filter n2(mask, "1", LEN);
337  AP_filter f3(mask_inv, "0", LEN);
338  AP_filter n3(mask_inv, "1", LEN);
339 
340  TEST_EXPECT_EQUAL(f1.get_length(), LEN);
341  TEST_EXPECT_EQUAL(f2.get_length(), LEN);
342  TEST_EXPECT_EQUAL(f3.get_length(), LEN);
343  TEST_EXPECT_EQUAL(n2.get_length(), LEN);
344  TEST_EXPECT_EQUAL(n3.get_length(), LEN);
345 
346  TEST_EXPECT_EQUAL(f1.get_filtered_length(), LEN);
347  TEST_EXPECT_EQUAL(f2.get_filtered_length(), MASK_BITCOUNT);
348  TEST_EXPECT_EQUAL(f3.get_filtered_length(), LEN-MASK_BITCOUNT);
349  TEST_EXPECT_EQUAL(n2.get_filtered_length(), LEN-MASK_BITCOUNT);
350  TEST_EXPECT_EQUAL(n3.get_filtered_length(), MASK_BITCOUNT);
351 
352  TEST_EXPECT_NO_ERROR(f1.is_invalid());
353  TEST_EXPECT_NO_ERROR(f2.is_invalid());
354  TEST_EXPECT_NO_ERROR(f3.is_invalid());
355  TEST_EXPECT_NO_ERROR(n2.is_invalid());
356  TEST_EXPECT_NO_ERROR(n3.is_invalid());
357 
358  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f1.to_string(), "11111111111111111111");
359  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f2.to_string(), mask);
360  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f3.to_string(), mask_inv);
361  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(n2.to_string(), mask_inv);
362  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(n3.to_string(), mask);
363 
364  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f1.filter_string(seq), seq);
365  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f2.filter_string(seq), seq_masked);
366  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f3.filter_string(seq), seq_masked_inv);
367  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(n2.filter_string(seq), seq_masked_inv);
368  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(n3.filter_string(seq), seq_masked);
369 
370  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f1.blowup_string(seq, '_'), seq);
371  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f2.blowup_string(seq_masked, '_'), blow_mask);
372  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(f3.blowup_string(seq_masked_inv, '_'), blow_mask_inv);
373 
374  // test inverting filters:
375  AP_filter i2(NOT, f2);
376  AP_filter i3(NOT, f3);
377 
378  TEST_EXPECT_EQUAL_FILTERS(i2, n2);
379  TEST_EXPECT_EQUAL_FILTERS(i3, n3);
380 
381  // test filter combination (AND + OR):
382  AP_filter s2(mask_some, "0", LEN);
383  AP_filter s3(NOT, s2);
384 
385  AP_filter as23(s2, AND, s3);
386  AP_filter os23(s2, OR, s3);
387 
388  TEST_EXPECT_ERROR_CONTAINS(as23.is_invalid(), "Sequence completely filtered out (no columns left)");
389 
390  TEST_EXPECT_EQUAL(as23.get_filtered_length(), 0);
391  TEST_EXPECT_EQUAL(os23.get_filtered_length(), LEN);
392 
393  AP_filter fs22(f2, AND, s2);
394  AP_filter fs23(f2, AND, s3);
395  AP_filter o2223(fs22, OR, fs23);
396 
397  TEST_EXPECT_EQUAL_FILTERS(o2223, f2);
398 
399  AP_filter x(fs22, XOR, fs23);
400  AP_filter xa1(AP_filter(fs22, AND, AP_filter(NOT, fs23)), OR, AP_filter(AP_filter(NOT, fs22), AND, fs23)); // = (a&&!b) || (!a&&b)
401  AP_filter xa2(AP_filter(fs22, OR, fs23), AND, AP_filter(NOT, AP_filter(fs22, AND, fs23))); // = (a||b) && !(a&&b)
402 
403  TEST_EXPECT_EQUAL_FILTERS(x, xa1);
404  TEST_EXPECT_EQUAL_FILTERS(x, xa2);
405 
406  TEST_EXPECT_EQUAL_STRINGCOPY__NOERROREXPORTED(x.to_string(), mask);
407 }
408 
409 #endif // UNIT_TESTS
410 
411 // --------------------------------------------------------------------------------
const size_t * get_filterpos_2_seqpos() const
Definition: AP_filter.hxx:91
GB_TYPES type
char * blowup_string(const char *filtered_string, char insert) const
Definition: AP_filter.cxx:222
long AP_timer()
Definition: AP_filter.cxx:299
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
Definition: AP_filter.hxx:36
void enable_bootstrap()
Definition: AP_filter.cxx:206
FILE * seq
Definition: rns.c:46
AP_weights(const AP_filter *fil)
Definition: AP_filter.cxx:261
unsigned int GB_UINT4
Definition: arbdb_base.h:37
char * to_string() const
Definition: AP_filter.cxx:146
char * filter_string(const char *fulllen_string) const
Definition: AP_filter.cxx:239
static int weights[MAX_BASETYPES][MAX_BASETYPES]
Definition: ClustalV.cxx:71
size_t get_length() const
Definition: AP_filter.hxx:83
#define af_assert(cond)
Definition: AP_filter.hxx:24
int GB_random(int range)
Definition: admath.cxx:88
AWT_FILTER_SIMPLIFY
Definition: AP_filter.hxx:28
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
void enable_simplify(AWT_FILTER_SIMPLIFY type)
Definition: AP_filter.cxx:160
AF_Not
Definition: AP_filter.hxx:35
#define TEST_EXPECT_NO_ERROR(call)
Definition: test_unit.h:1118
#define NULp
Definition: cxxforward.h:116
#define TEST_EXPECT_ERROR_CONTAINS(call, part)
Definition: test_unit.h:1114
AF_Combine
Definition: AP_filter.hxx:36
AP_filter(size_t size)
Definition: AP_filter.cxx:63
bool use_position(size_t pos) const
Definition: AP_filter.hxx:85
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1294
void ARB_alloc_aligned(TYPE *&tgt, size_t nelems)
Definition: arb_mem.h:37