ARB
AP_seq_dna.cxx
Go to the documentation of this file.
1 #include "AP_seq_dna.hxx"
2 
3 #include <arb_mem.h>
4 #include <AP_pro_a_nucs.hxx>
5 #include <AP_filter.hxx>
6 #include <ARB_Tree.hxx>
7 
8 
9 inline bool hasGap(char c) { return c & AP_GAP; }
10 inline bool isGap(char c) { return c == AP_GAP; }
11 
12 inline bool notHasGap(char c) { return !hasGap(c); }
13 inline bool notIsGap(char c) { return !isGap(c); }
14 
15 // -------------------------------
16 // AP_sequence_parsimony
17 
18 char *AP_sequence_parsimony::table;
19 
20 AP_sequence_parsimony::AP_sequence_parsimony(const AliView *aliview) :
21  AP_combinableSeq(aliview),
22  seq_pars(NULp)
23 {}
24 
25 AP_sequence_parsimony::~AP_sequence_parsimony() {
26  free(seq_pars);
27 }
28 
29 AP_combinableSeq *AP_sequence_parsimony::dup() const {
30  return new AP_sequence_parsimony(get_aliview());
31 }
32 
33 int AP_sequence_parsimony::cmp_combined(const AP_combinableSeq *other) const {
34  const AP_sequence_parsimony *sother = DOWNCAST(const AP_sequence_parsimony*, other);
35 
36  const unsigned char *s1 = get_usequence();
37  const unsigned char *s2 = sother->get_usequence();
38 
39  size_t len = get_sequence_length();
40 
41  for (size_t i = 0; i<len; ++i) {
42  int comp = int(s1[i]) - int(s2[i]);
43  if (comp) return comp;
44  }
45 
46  return 0;
47 }
48 
49 void AP_sequence_parsimony::build_table() {
50  table = (char *)AP_create_dna_to_ap_bases();
51 }
52 
53 
54 
55 /* --------------------------------------------------------------------------------
56  * combine(const AP_sequence *lefts, const AP_sequence *rights)
57  * set(char *isequence)
58  *
59  * for wagner & fitch parsimony algorithm
60  *
61  * Note: is_set_flag is used by AP_tree_nlen::parsimony_rek()
62  * see ../../PARSIMONY/AP_tree_nlen.cxx@parsimony_rek
63 */
64 
65 // #define SHOW_SEQ
66 
67 void AP_sequence_parsimony::set(const char *isequence) {
68  size_t sequence_len = get_filter()->get_filtered_length();
69  ARB_alloc_aligned(seq_pars, sequence_len+1);
70  memset(seq_pars, AP_DOT, (size_t)sequence_len+1); // init with dots
71 
72  const uchar *simplify = get_filter()->get_simplify_table();
73  if (!table) this->build_table();
74 
75  const AP_filter *filt = get_filter();
76  if (filt->does_bootstrap()) {
77  size_t iseqlen = strlen(isequence);
78 
79  for (size_t i = 0; i<sequence_len; ++i) {
80  size_t pos = filt->bootstrapped_seqpos(i); // random indices (but same for all species)
81 
82  ap_assert(pos<iseqlen);
83  if (pos >= iseqlen) continue;
84 
85  unsigned char c = (unsigned char)isequence[pos];
86 
87 #if defined(SHOW_SEQ)
88  fputc(simplify[c], stdout);
89 #endif // SHOW_SEQ
90 
91  seq_pars[i] = table[simplify[c]];
92  }
93  }
94  else {
95  const size_t* base_pos = filt->get_filterpos_2_seqpos();
96 
97  for (size_t i = 0; i < sequence_len; ++i) {
98  size_t pos = base_pos[i];
99  unsigned char c = (unsigned char)isequence[pos];
100  seq_pars[i] = table[simplify[c]];
101 
102 #if defined(SHOW_SEQ)
103  fputc(simplify[c], stdout);
104 #endif // SHOW_SEQ
105  }
106  }
107 
108 #if defined(SHOW_SEQ)
109  fputc('\n', stdout);
110 #endif // SHOW_SEQ
111 
112  mark_sequence_set(true);
113 }
114 
115 void AP_sequence_parsimony::unset() {
116  freenull(seq_pars);
117  mark_sequence_set(false);
118 }
119 
126 template <class COUNT, class SITE>
127 static long do_combine(size_t sequence_len,
128  const char * __restrict p1,
129  const char * __restrict p2,
130  char * __restrict p,
131  COUNT count,
132  SITE site)
133 {
134  for (size_t idx = 0; idx<sequence_len; ++idx) { // LOOP_VECTORIZED=4 (ok, do_combine is used 4 times)
135  char c1 = p1[idx];
136  char c2 = p2[idx];
137 
138  char c = c1 & c2;
139  p[idx] = (c==0)?c1|c2:c;
140 
141  count.add(idx, c);
142  site.add(idx, c);
143  }
144 
145  return count.sum;
146 }
147 
148 template <class COUNT>
149 static long do_countMutations(size_t sequence_len,
150  const char * __restrict p1,
151  const char * __restrict p2,
152  COUNT count)
153 {
154  for (size_t idx = 0; idx<sequence_len; ++idx) { // LOOP_VECTORIZED=2 (ok, do_countMutations is used 2 times)
155  char c1 = p1[idx];
156  char c2 = p2[idx];
157 
158  char c = c1 & c2;
159 
160  count.add(idx, c);
161  }
162 
163  return count.sum;
164 }
165 
167  long sum;
168  count_unweighted():sum(0){}
169  void add(size_t, char c) {
170  sum += !c;
171  }
172 };
173 
175  long sum;
177 
178  count_weighted(const GB_UINT4 *w) : sum(0), weights(w) {}
179  void add(size_t idx, char c) {
180  sum += !c * weights[idx];
181  }
182 };
183 
185  void add(size_t, char) {}
186 };
187 
189  char *sites;
190  count_mutpsite(char *s) : sites(s) {}
191  void add(size_t idx, char c) {
192  // below code is equal to "if (!c) ++sites[idx]", the difference
193  // is that no branch is required and sites[idx] is always
194  // written, allowing vectorization.
195  //
196  // For unknown reasons gcc 4.8.1, 4.9.2 and 5.1.0
197  // refuses to vectorize 'c==0?1:0' or '!c'
198 
199  sites[idx] += ((c | -c) >> 7 & 1) ^ 1;
200  }
201 };
202 
203 #define NEVER_COMBINE_ASYNC
204 
205 #if defined(Cxx11)
206 # if !defined(NEVER_COMBINE_ASYNC)
207 # define ASYNC_COMBINE
208 # endif
209 #endif
210 
211 #if defined(ASYNC_COMBINE)
212 # include <future>
213 #endif
214 
215 class CombinableSeq : virtual Noncopyable {
216  // input (read-only):
217  size_t sequence_len;
218  const char *s1;
219  const char *s2;
220 
221  const GB_UINT4 *weights; // NULp -> unweighted
222 
223  // output:
224  char *out; // should not be shared!
225  char *mutation_per_site; // NULp -> do not count (Warning: shared memory -> do not modify unguarded!)
226 
227  Mutations calculate() const;
228 
229 #if defined(ASYNC_COMBINE)
230  std::future<Mutations> f;
231  void allow_async_calc(bool allow_async) {
232  ap_assert(!f.valid());
233  if (allow_async) {
234  f = std::async( [this]() { return calculate(); } );
235  ap_assert(f.valid());
236  }
237  }
238  Mutations calc_result() {
240  if (f.valid()) {
241  try {
242  result = f.get();
243  }
244  catch (std::system_error& serr) {
245  fprintf(stderr, "catched system_error %i: %s\n", serr.code().value(), serr.what());
246  result = -1;
247  }
248  }
249  else {
250  result = calculate();
251  }
252  return result;
253  }
254 #else
255 # if defined(NEVER_COMBINE_ASYNC)
256  void allow_async_calc(bool IF_ASSERTION_USED(allow_async)) {
257  ap_assert(!allow_async); // asynchronous calculation completely disabled atm
258  }
259 # else // !NEVER_COMBINE_ASYNC
260  void allow_async_calc(bool) {}
261 # endif
262  Mutations calc_result() { return calculate(); }
263 #endif
264 
265 public:
266  CombinableSeq(size_t seq_len, const char *seq1, const char *seq2, char *result, char *mutation_per_site_, const GB_UINT4 *weights_, bool allow_async) :
267  sequence_len(seq_len),
268  s1(seq1),
269  s2(seq2),
270  weights(weights_),
271  out(result),
272  mutation_per_site(mutation_per_site_)
273  {
274  // cannot calculate asynchronously if mutation_per_site is specified!
275  // (mutation_per_site is shared between all instances of CombinableSeq and gets modified by calculate())
276  allow_async_calc(allow_async && !mutation_per_site);
277  }
278 
280  return calc_result();
281  }
282 };
283 
284 Mutations CombinableSeq::calculate() const {
285  Mutations mutations;
286  if (!out) {
287  ap_assert(!mutation_per_site);
288  if (!weights) {
289  mutations = do_countMutations(sequence_len, s1, s2, count_unweighted());
290  }
291  else {
292  mutations = do_countMutations(sequence_len, s1, s2, count_weighted(weights));
293  }
294  }
295  else if (!weights) {
296  if (mutation_per_site) {
297  mutations = do_combine(sequence_len, s1, s2, out,
299  count_mutpsite(mutation_per_site));
300  }
301  else {
302  mutations = do_combine(sequence_len, s1, s2, out,
304  count_nothing());
305  }
306  }
307  else {
308  UNCOVERED(); // by unittests!
309  if (mutation_per_site) {
310  mutations = do_combine(sequence_len, s1, s2, out,
311  count_weighted(weights),
312  count_mutpsite(mutation_per_site));
313  }
314  else {
315  mutations = do_combine(sequence_len, s1, s2, out,
316  count_weighted(weights),
317  count_nothing());
318  }
319  }
320 
321  return mutations;
322 }
323 
324 Mutations AP_sequence_parsimony::combine_seq(const AP_combinableSeq *lefts, const AP_combinableSeq *rights, char *mutation_per_site) {
325  const AP_sequence_parsimony *left = DOWNCAST(const AP_sequence_parsimony*, lefts);
326  const AP_sequence_parsimony *right = DOWNCAST(const AP_sequence_parsimony*, rights);
327 
328  size_t sequence_len = get_sequence_length();
329  if (!seq_pars) {
330  ARB_alloc_aligned(seq_pars, sequence_len + 1);
331  }
332 
333  const GB_UINT4 *weights = get_weights()->is_unweighted() ? NULp : get_weights()->get_weights();
334  CombinableSeq cs(sequence_len, left->get_sequence(), right->get_sequence(), seq_pars, mutation_per_site, weights, false);
335  long result = cs.get_result();
336 
337  inc_combine_count();
338  mark_sequence_set(true);
339 
340  ap_assert(result >= 0);
341  return result;
342 }
343 
344 Mutations AP_sequence_parsimony::mutations_if_combined_with(const AP_combinableSeq *other) {
345  size_t sequence_len = get_sequence_length();
346  const GB_UINT4 *weights = get_weights()->is_unweighted() ? NULp : get_weights()->get_weights();
347 
348  const AP_sequence_parsimony *pother = DOWNCAST(const AP_sequence_parsimony*, other);
349 
350  CombinableSeq cs(sequence_len, get_sequence(), pother->get_sequence(), NULp, NULp, weights, false);
351  long result = cs.get_result();
352 
353  inc_combine_count();
354  ap_assert(result >= 0);
355  return result;
356 }
357 
358 void AP_sequence_parsimony::partial_match(const AP_combinableSeq *part_, long *overlapPtr, long *penaltyPtr) const {
359  // matches the partial sequences 'part_' against 'this'
360  // '*penaltyPtr' is set to the number of mismatches (possibly weighted)
361  // '*overlapPtr' is set to the number of base positions both sequences overlap
362  // example:
363  // fullseq 'XXX---XXX' 'XXX---XXX'
364  // partialseq '-XX---XX-' '---XXX---'
365  // overlap 7 3
366  //
367  // algorithm is similar to AP_sequence_parsimony::combine()
368  // Note: changes done here should also be be applied to AP_seq_protein.cxx@partial_match_impl
369 
370  const AP_sequence_parsimony *part = (const AP_sequence_parsimony *)part_;
371 
372  const char *pf = get_sequence();
373  const char *pp = part->get_sequence();
374 
375  const AP_weights *weights = get_weights();
376 
377  long min_end; // minimum of both last non-gap positions
378  for (min_end = get_sequence_length()-1; min_end >= 0; --min_end) {
379  char both = pf[min_end]|pp[min_end];
380  if (notHasGap(both)) { // last non-gap found
381  if (notHasGap(pf[min_end])) { // occurred in full sequence
382  for (; min_end >= 0; --min_end) { // search same in partial sequence
383  if (notHasGap(pp[min_end])) break;
384  }
385  }
386  else {
387  ap_assert(notHasGap(pp[min_end])); // occurred in partial sequence
388  for (; min_end >= 0; --min_end) { // search same in full sequence
389  if (notHasGap(pf[min_end])) break;
390  }
391  }
392  break;
393  }
394  }
395 
396  long penalty = 0;
397  long overlap = 0;
398 
399  if (min_end >= 0) {
400  long max_start; // maximum of both first non-gap positions
401  for (max_start = 0; max_start <= min_end; ++max_start) {
402  char both = pf[max_start]|pp[max_start];
403  if (notHasGap(both)) { // first non-gap found
404  if (notHasGap(pf[max_start])) { // occurred in full sequence
405  for (; max_start <= min_end; ++max_start) { // search same in partial
406  if (notHasGap(pp[max_start])) break;
407  }
408  }
409  else {
410  ap_assert(notHasGap(pp[max_start])); // occurred in partial sequence
411  for (; max_start <= min_end; ++max_start) { // search same in full
412  if (notHasGap(pf[max_start])) break;
413  }
414  }
415  break;
416  }
417  }
418 
419  if (max_start <= min_end) { // if sequences overlap
420  for (long idx = max_start; idx <= min_end; ++idx) {
421  if ((pf[idx]&pp[idx]) == 0) { // bases are distinct (aka mismatch)
422  penalty += weights->weight(idx);
423  }
424  }
425  overlap = min_end-max_start+1;
426  }
427  }
428 
429  *overlapPtr = overlap;
430  *penaltyPtr = penalty;
431 }
432 
433 AP_FLOAT AP_sequence_parsimony::count_weighted_bases() const {
434  static char *hits = NULp;
435  if (!hits) {
436  ARB_alloc(hits, 256);
437  memset(hits, 1, 256); // count ambiguous characters half
438 
439  hits[AP_A] = 2; // count real characters full
440  hits[AP_C] = 2;
441  hits[AP_G] = 2;
442  hits[AP_T] = 2;
443 
444  hits[AP_GAP] = 0; // don't count gaps
445  hits[AP_DOT] = 0; // don't count dots (and other stuff)
446  }
447 
448  const AP_weights *weights = get_weights();
449  const char *p = get_sequence();
450 
451  long sum = 0;
452  size_t sequence_len = get_sequence_length();
453 
454  for (size_t i = 0; i<sequence_len; ++i) {
455  sum += hits[safeCharIndex(p[i])] * weights->weight(i);
456  }
457 
458  AP_FLOAT wcount = sum * 0.5;
459  return wcount;
460 }
461 
462 uint32_t AP_sequence_parsimony::checksum() const {
463  const char *seq = get_sequence();
464  return GB_checksum(seq, sizeof(*seq)*get_sequence_length(), 0, NULp);
465 }
466 
string result
const size_t * get_filterpos_2_seqpos() const
Definition: AP_filter.hxx:91
static long do_combine(size_t sequence_len, const char *__restrict p1, const char *__restrict p2, char *__restrict p, COUNT count, SITE site)
Definition: AP_seq_dna.cxx:127
char * AP_create_dna_to_ap_bases()
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
void add(size_t, char)
Definition: AP_seq_dna.cxx:185
Mutations get_result()
Definition: AP_seq_dna.cxx:279
bool notHasGap(char c)
Definition: AP_seq_dna.cxx:12
size_t bootstrapped_seqpos(size_t bpos) const
Definition: AP_filter.hxx:111
bool hasGap(char c)
Definition: AP_seq_dna.cxx:9
uint32_t GB_checksum(const char *seq, long length, int ignore_case, const char *exclude)
Definition: adstring.cxx:319
FILE * seq
Definition: rns.c:46
#define DOWNCAST(totype, expr)
Definition: downcast.h:141
double AP_FLOAT
Definition: AP_matrix.hxx:30
unsigned int GB_UINT4
Definition: arbdb_base.h:37
bool isGap(char c)
Definition: AP_seq_dna.cxx:10
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
const GB_UINT4 * weights
Definition: AP_seq_dna.cxx:176
static long do_countMutations(size_t sequence_len, const char *__restrict p1, const char *__restrict p2, COUNT count)
Definition: AP_seq_dna.cxx:149
bool does_bootstrap() const
Definition: AP_filter.hxx:109
static int weights[MAX_BASETYPES][MAX_BASETYPES]
Definition: ClustalV.cxx:71
long Mutations
Definition: AP_sequence.hxx:99
CombinableSeq(size_t seq_len, const char *seq1, const char *seq2, char *result, char *mutation_per_site_, const GB_UINT4 *weights_, bool allow_async)
Definition: AP_seq_dna.cxx:266
fputc('\n', stderr)
count_mutpsite(char *s)
Definition: AP_seq_dna.cxx:190
void add(size_t idx, char c)
Definition: AP_seq_dna.cxx:191
#define ap_assert(cond)
static int COUNT(DictTree tree)
void add(size_t idx, char c)
Definition: AP_seq_dna.cxx:179
#define IF_ASSERTION_USED(x)
Definition: arb_assert.h:308
bool notIsGap(char c)
Definition: AP_seq_dna.cxx:13
void add(size_t, char c)
Definition: AP_seq_dna.cxx:169
unsigned char uchar
Definition: gde.hxx:21
#define NULp
Definition: cxxforward.h:116
count_weighted(const GB_UINT4 *w)
Definition: AP_seq_dna.cxx:178
GB_UINT4 weight(size_t idx) const
Definition: AP_filter.hxx:162
void ARB_alloc_aligned(TYPE *&tgt, size_t nelems)
Definition: arb_mem.h:37
#define UNCOVERED()
Definition: arb_assert.h:380
GB_write_int const char s
Definition: AW_awar.cxx:154