ARB
AP_seq_protein.cxx
Go to the documentation of this file.
1 #include "AP_seq_protein.hxx"
2 #include <AP_pro_a_nucs.hxx>
3 
4 #include <AP_filter.hxx>
5 #include <ARB_Tree.hxx>
6 
7 #include <arb_str.h>
8 #include <climits>
9 
10 inline bool hasGap(AP_PROTEINS c) { return c & APP_GAP; }
11 inline bool isGap(AP_PROTEINS c) { return c == APP_GAP; }
12 
13 inline bool notHasGap(AP_PROTEINS c) { return !hasGap(c); }
14 inline bool notIsGap(AP_PROTEINS c) { return !isGap(c); }
15 
16 // #define ap_assert(bed) arb_assert(bed)
17 
18 AP_sequence_protein::AP_sequence_protein(const AliView *aliview) :
19  AP_combinableSeq(aliview),
20  seq_prot(NULp),
21  mut1(NULp),
22  mut2(NULp)
23 {}
24 
25 AP_sequence_protein::~AP_sequence_protein() {
26  delete [] mut2;
27  delete [] mut1;
28  delete [] seq_prot;
29 }
30 
31 AP_combinableSeq *AP_sequence_protein::dup() const {
32  return new AP_sequence_protein(get_aliview());
33 }
34 
36  APP_A,
37  APP_B,
38  APP_C,
39  APP_D,
40  APP_E,
41  APP_F,
42  APP_G,
43  APP_H,
44  APP_I,
45  APP_ILLEGAL, // J
46  APP_K,
47  APP_L,
48  APP_M,
49  APP_N,
50  APP_ILLEGAL, // O
51  APP_P,
52  APP_Q,
53  APP_R,
54  APP_S,
55  APP_T,
56  APP_ILLEGAL, // U
57  APP_V,
58  APP_W,
59  APP_X,
60  APP_Y,
61  APP_Z
62 };
63 
64 #define PROTEINS_TO_TEST 22 // 26 plus gap and star, minus 3 illegal, 'X', 'B' and 'Z'
65 
66 static AP_PROTEINS prot2test[PROTEINS_TO_TEST] = { // uses same indexing as prot_idx
67  APP_STAR,
68  APP_A,
69  APP_C,
70  APP_D,
71  APP_E,
72  APP_F,
73  APP_G,
74  APP_H,
75  APP_I,
76  APP_K,
77  APP_L,
78  APP_M,
79  APP_N,
80  APP_P,
81  APP_Q,
82  APP_R,
83  APP_S,
84  APP_T,
85  APP_V,
86  APP_W,
87  APP_Y,
88  APP_GAP
89 };
90 
91 static int prot_idx[PROTEINS_TO_TEST] = { // uses same indexing as prot2test
92  // contains indexes for 'AWT_distance_meter->dist'
93  0, // *
94  1, // A
95  3, // C
96  4, // D
97  5, // E
98  6, // F
99  7, // G
100  8, // H
101  9, // I
102  10, // K
103  11, // L
104  12, // M
105  13, // N
106  14, // P
107  15, // Q
108  16, // R
109  17, // S
110  18, // T
111  19, // V
112  20, // W
113  21, // Y
114  23 // gap
115 };
116 
117 inline const char *readable_combined_protein(AP_PROTEINS p) {
118  if (p == APP_X) { return "X"; }
119  if (p == APP_DOT) { return "."; }
120 
121  static char buffer[PROTEINS_TO_TEST+1];
122  memset(buffer, 0, PROTEINS_TO_TEST+1);
123  char *bp = buffer;
124  const char *readable = "*ACDEFGHIKLMNPQRSTVWY-"; // same index as prot2test
125 
126  for (int b = 0; b<PROTEINS_TO_TEST; ++b) {
127  if (p&prot2test[b]) {
128  *bp++ = readable[b];
129  }
130  }
131  return buffer;
132 }
133 
136 
137 // OMA = one mutation away
138 // (speedup for huge table is approx. 4-7%)
139 #define OMA_SLOW_LOWMEM
140 
141 #if defined(ASSERTION_USED) && 0
142 #define OMA_DOUBLE_CHECK
143 #endif
144 
145 #if defined(OMA_DOUBLE_CHECK)
146 # define IMPL_OMA_SLOW_LOWMEM
147 # define IMPL_OMA_FAST_BIGMEM
148 #else
149 # if defined(OMA_SLOW_LOWMEM)
150 # define IMPL_OMA_SLOW_LOWMEM
151 # else
152 # define IMPL_OMA_FAST_BIGMEM
153 # endif
154 #endif
155 
156 STATIC_ASSERT(APP_MAX == 4194303);
157 STATIC_ASSERT(sizeof(AP_PROTEINS) == 4);
158 
159 #if defined(IMPL_OMA_FAST_BIGMEM)
160 
161 static AP_PROTEINS one_mutation_away[APP_MAX+1]; // contains all proteins that are <= 1 nuc-mutations away from protein-combination used as index
162 STATIC_ASSERT(sizeof(one_mutation_away) == 16777216); // ~ 16Mb
163 
164 #endif
165 
166 #if defined(IMPL_OMA_SLOW_LOWMEM)
167 
171 
173  return AP_PROTEINS(one_mutation_away_0_7 [ p & 0xff] |
174  one_mutation_away_8_15 [(p>>8) & 0xff] |
175  one_mutation_away_16_23[(p>>16) & 0xff]);
176 }
177 
178 #endif
179 
180 
182 #if defined(OMA_SLOW_LOWMEM)
183  return calcOneMutationAway(p);
184 #else
185  return one_mutation_away[p];
186 #endif
187 }
188 
189 static void update_min_mutations(int code_nr, const AWT_distance_meter *distance_meter) {
190  if (min_mutations_initialized_for_codenr != code_nr) {
191  for (int d = 0; d<PROTEINS_TO_TEST; ++d) {
192  int D = prot_idx[d];
193  int D_bit = 1<<D;
194 
195  for (int s = 0; s<PROTEINS_TO_TEST; ++s) {
196  const AWT_PDP *dist = distance_meter->getDistance(prot_idx[s]);
197 
198  int i;
199  for (i = 0; i<3; ++i) {
200  if (dist->patd[i] & D_bit) break;
201  }
202 
203  prot_mindist[s][d] = char(i);
204  }
205  }
206 
207 
208 #if defined(IMPL_OMA_FAST_BIGMEM)
209  memset(one_mutation_away, 0, sizeof(one_mutation_away));
210 #endif
211 #if defined(IMPL_OMA_SLOW_LOWMEM)
212  memset(one_mutation_away_0_7, 0, sizeof(one_mutation_away_0_7));
213  memset(one_mutation_away_8_15, 0, sizeof(one_mutation_away_8_15));
214  memset(one_mutation_away_16_23, 0, sizeof(one_mutation_away_16_23));
215 #endif
216 
217  for (int s = 0; s<PROTEINS_TO_TEST; ++s) {
218  AP_PROTEINS oma = APP_ILLEGAL;
219  for (int d = 0; d<PROTEINS_TO_TEST; ++d) {
220  if (prot_mindist[s][d] == 1) {
221  oma = AP_PROTEINS(oma|prot2test[d]);
222  }
223  }
224 
225  AP_PROTEINS source = prot2test[s];
226  oma = AP_PROTEINS(oma|source);
227 
228 #if defined(IMPL_OMA_FAST_BIGMEM)
229  one_mutation_away[source] = oma;
230 #endif
231 #if defined(IMPL_OMA_SLOW_LOWMEM)
232  uint32_t idx = source & 0xff; if (idx) one_mutation_away_0_7 [idx] = oma;
233  idx = (source>>8) & 0xff; if (idx) one_mutation_away_8_15 [idx] = oma;
234  idx = (source>>16) & 0xff; if (idx) one_mutation_away_16_23[idx] = oma;
235 #endif
236  }
237 
238 #if defined(IMPL_OMA_FAST_BIGMEM)
239  for (size_t i = 0; i<=APP_MAX; ++i) {
240  if (one_mutation_away[i] == APP_ILLEGAL) {
241  size_t j = i;
242  size_t b = 1;
243  AP_PROTEINS oma = APP_ILLEGAL;
244 
245  while (j) {
246  if (j&1) oma = AP_PROTEINS(oma|one_mutation_away[b]);
247  j >>= 1;
248  b <<= 1;
249  }
250 
251  one_mutation_away[i] = oma;
252  }
253  }
254 #endif
255 #if defined(IMPL_OMA_SLOW_LOWMEM)
256  for (int s = 0; s<8; s++) {
257  int b = 1<<s;
258  for (int i=b+1; i<256; i++) {
259  if (i & b) {
260  one_mutation_away_0_7[i] = AP_PROTEINS(one_mutation_away_0_7[i] | one_mutation_away_0_7[b]);
261  one_mutation_away_8_15[i] = AP_PROTEINS(one_mutation_away_8_15[i] | one_mutation_away_8_15[b]);
262  one_mutation_away_16_23[i] = AP_PROTEINS(one_mutation_away_16_23[i] | one_mutation_away_16_23[b]);
263  }
264  }
265  }
266 #endif
267 
268 #if defined(IMPL_OMA_FAST_BIGMEM) && defined(DEBUG)
269  for (size_t i = 0; i<=APP_MAX; ++i) {
270  if (one_mutation_away[i] == 0) {
271  fprintf(stderr, "oma[%s] is zero\n", readable_combined_protein(AP_PROTEINS(i)));
272  }
273  }
274  for (size_t i = 0; i<=APP_MAX; ++i) {
275  AP_PROTEINS two_mutations_away = one_mutation_away[one_mutation_away[i]];
276  bool gap = hasGap(AP_PROTEINS(i));
277  if ((!gap && two_mutations_away != APP_X) || (gap && two_mutations_away != APP_DOT)) {
278  // reached for a few amino-acid-combinations: C, F, C|F, K, M, K|M
279  // and for APP_ILLEGAL and APP_GAP as below for 3 mutations
280  fprintf(stderr, "tma[%s]", readable_combined_protein(AP_PROTEINS(i)));
281  fprintf(stderr, "=%s\n", readable_combined_protein(two_mutations_away));
282  }
283  }
284  for (size_t i = 0; i<=APP_MAX; ++i) {
285  AP_PROTEINS three_mutations_away = one_mutation_away[one_mutation_away[one_mutation_away[i]]];
286  bool gap = hasGap(AP_PROTEINS(i));
287  if ((!gap && three_mutations_away != APP_X) || (gap && three_mutations_away != APP_DOT)) {
288  // only reached for i==APP_ILLEGAL and i==APP_GAP (result is wrong for latter)
289  fprintf(stderr, "3ma[%s]", readable_combined_protein(AP_PROTEINS(i)));
290  fprintf(stderr, "=%s\n", readable_combined_protein(three_mutations_away));
291  }
292  }
293 #endif
294 
295 #if defined(OMA_DOUBLE_CHECK)
296  for (size_t i = 0; i<=APP_MAX; ++i) {
297  AP_PROTEINS p = AP_PROTEINS(i);
298  ap_assert(calcOneMutationAway(p) == one_mutation_away[p]);
299  }
300 #endif
301 
302  min_mutations_initialized_for_codenr = code_nr;
303  }
304 }
305 
306 
307 #if defined(DEBUG)
308 // #define SHOW_SEQ
309 #endif // DEBUG
310 
311 void AP_sequence_protein::set(const char *isequence) {
312  AWT_translator *translator = AWT_get_user_translator(get_aliview()->get_gb_main());
313  update_min_mutations(translator->CodeNr(), translator->getDistanceMeter());
314 
315  size_t sequence_len = get_sequence_length();
316 
317  seq_prot = new AP_PROTEINS[sequence_len+1];
318  mut1 = new AP_PROTEINS[sequence_len+1];
319  mut2 = new AP_PROTEINS[sequence_len+1];
320 
321  ap_assert(!get_filter()->does_bootstrap()); // bootstrapping not implemented for protein parsimony
322 
323  const AP_filter *filt = get_filter();
324  const uchar *simplify = filt->get_simplify_table();
325  int left_bases = sequence_len;
326  long filter_len = filt->get_length();
327 
328  ap_assert(filt);
329 
330  size_t oidx = 0; // index for output sequence
331 
332  // check if initialized for correct instance of translator:
333  ap_assert(min_mutations_initialized_for_codenr == AWT_get_user_translator()->CodeNr());
334 
335  for (int idx = 0; idx<filter_len && left_bases; ++idx) {
336  if (filt->use_position(idx)) {
337  char c = toupper(simplify[safeCharIndex(isequence[idx])]);
339 
340 #if defined(SHOW_SEQ)
341  fputc(c, stdout);
342 #endif // SHOW_SEQ
343 
344  if (c >= 'A' && c <= 'Z') p = prot2AP_PROTEIN[c-'A'];
345  else if (c == '-') p = APP_GAP;
346  else if (c == '.') p = APP_DOT;
347  else if (c == '*') p = APP_STAR;
348 
349  if (p == APP_ILLEGAL) {
350  GB_warning(GBS_global_string("Invalid sequence character '%c' replaced by dot", c));
351  p = APP_DOT;
352  }
353 
354  seq_prot[oidx] = p;
355  mut1[oidx] = oneMutationAway(p);
356  mut2[oidx] = oneMutationAway(mut1[oidx]);
357 
358  ++oidx;
359  --left_bases;
360  }
361  }
362 
363  ap_assert(oidx == sequence_len);
364  seq_prot[sequence_len] = APP_ILLEGAL;
365 
366 #if defined(SHOW_SEQ)
367  fputc('\n', stdout);
368 #endif // SHOW_SEQ
369 
370  mark_sequence_set(true);
371 }
372 
373 void AP_sequence_protein::unset() {
374  delete [] mut2; mut2 = NULp;
375  delete [] mut1; mut1 = NULp;
376  delete [] seq_prot; seq_prot = NULp;
377  mark_sequence_set(false);
378 }
379 
380 Mutations AP_sequence_protein::combine_seq(const AP_combinableSeq *lefts, const AP_combinableSeq *rights, char *mutation_per_site) {
381  // Note: changes done here should also be be applied to AP_seq_dna.cxx@combine_impl
382 
383  // now uses same algorithm as done till [877]
384 
385  const AP_sequence_protein *left = DOWNCAST(const AP_sequence_protein*, lefts);
386  const AP_sequence_protein *right = DOWNCAST(const AP_sequence_protein*, rights);
387 
388  size_t sequence_len = get_sequence_length();
389  if (!seq_prot) {
390  seq_prot = new AP_PROTEINS[sequence_len + 1];
391  mut1 = new AP_PROTEINS[sequence_len + 1];
392  mut2 = new AP_PROTEINS[sequence_len + 1];
393  }
394 
395  const AP_PROTEINS *p1 = left->get_sequence();
396  const AP_PROTEINS *p2 = right->get_sequence();
397 
398  const AP_PROTEINS *mut11 = left->get_mut1();
399  const AP_PROTEINS *mut21 = left->get_mut2();
400  const AP_PROTEINS *mut12 = right->get_mut1();
401  const AP_PROTEINS *mut22 = right->get_mut2();
402 
403  AP_PROTEINS *p = seq_prot;
404  const AP_weights *weights = get_weights();
405  char *mutpsite = mutation_per_site;
406 
407  long result = 0;
408  // check if initialized for correct instance of translator:
409  ap_assert(min_mutations_initialized_for_codenr == AWT_get_user_translator()->CodeNr());
410 
411  for (size_t idx = 0; idx<sequence_len; ++idx) {
412  AP_PROTEINS c1 = p1[idx];
413  AP_PROTEINS c2 = p2[idx];
414 
415  AP_PROTEINS onemut1 = mut11[idx];
416  AP_PROTEINS onemut2 = mut12[idx];
417  AP_PROTEINS twomut1 = mut21[idx];
418  AP_PROTEINS twomut2 = mut22[idx];
419 
420  ap_assert(c1 != APP_ILLEGAL);
421  ap_assert(c2 != APP_ILLEGAL);
422 
423  AP_PROTEINS contained_in_both = AP_PROTEINS(c1 & c2);
424  AP_PROTEINS contained_in_any = AP_PROTEINS(c1 | c2);
425 
426  AP_PROTEINS reachable_from_both_with_1_mut = AP_PROTEINS(onemut1 & onemut2);
427  AP_PROTEINS reachable_from_both_with_2_mut = AP_PROTEINS(twomut1 & twomut2);
428 
429  AP_PROTEINS max_cost_1 = AP_PROTEINS(contained_in_any & reachable_from_both_with_1_mut);
430  AP_PROTEINS max_cost_2 = AP_PROTEINS((contained_in_any & reachable_from_both_with_2_mut) | reachable_from_both_with_1_mut);
431 
432  if (contained_in_both) { // there are common proteins
433  p[idx] = contained_in_both; // store common proteins for both subtrees
434  mut1[idx] = max_cost_1;
435  mut2[idx] = max_cost_2;
436  }
437  else { // proteins are distinct
438  int mutations = INT_MAX;
439 
440  AP_PROTEINS reachable_from_both_with_3_mut = AP_PROTEINS((onemut1 & twomut2) | (onemut2 & twomut1)); // one with 1 mutation, other with 2 mutations
441 
442  AP_PROTEINS max_cost_3 = AP_PROTEINS(contained_in_any // = one w/o mutations, other with 3 mutations (=anything, i.e. & APP_DOT, skipped)
443  | reachable_from_both_with_3_mut);
444 
445  if (max_cost_1) {
446  // some proteins can be reached from both subtrees with 1 mutation
447  mutations = 1;
448  p[idx] = max_cost_1;
449  mut1[idx] = max_cost_2;
450  mut2[idx] = max_cost_3;
451  }
452  else {
453  AP_PROTEINS reachable_from_any_with_1_mut = AP_PROTEINS(onemut1 | onemut2);
454 
455  AP_PROTEINS max_cost_4 = AP_PROTEINS(reachable_from_any_with_1_mut // one with 1 mutation, other with 3 mutations (=anything, i.e. & APP_DOT, skipped)
456  | reachable_from_both_with_2_mut);
457  if (max_cost_2) {
458  // some proteins can be reached from both subtrees with 2 mutations
459  mutations = 2;
460  p[idx] = max_cost_2;
461  mut1[idx] = max_cost_3;
462  mut2[idx] = max_cost_4;
463  }
464  else {
465  ap_assert(max_cost_3);
466  AP_PROTEINS reachable_from_any_with_2_mut = AP_PROTEINS(twomut1 | twomut2);
467 
468  mutations = 3;
469  p[idx] = max_cost_3;
470  mut1[idx] = max_cost_4;
471  mut2[idx] = reachable_from_any_with_2_mut; // one with 2 mutations, other with 3 mutations
472  }
473  }
474  ap_assert(mutations >= 1 && mutations <= 3);
475 
476  if (mutpsite) mutpsite[idx] += mutations; // count mutations per site (unweighted)
477  result += mutations * weights->weight(idx); // count weighted or simple
478 
479  }
480 
481  AP_PROTEINS calc_mut1 = oneMutationAway(p[idx]);
482  mut1[idx] = AP_PROTEINS(mut1[idx] | calc_mut1);
483  AP_PROTEINS calc_mut2 = oneMutationAway(mut1[idx]);
484  mut2[idx] = AP_PROTEINS(mut2[idx] | calc_mut2);
485  }
486 
487 #if defined(DEBUG) && 0
488 #define P1 75
489 #define P2 90
490  printf("Seq1: ");
491  for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p1[idx]);
492  printf("\nSeq2: ");
493  for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p2[idx]);
494  printf("\nCombine value: %f\n", float(result));
495 #undef P1
496 #undef P2
497 #endif // DEBUG
498 
499 #if defined(DEBUG) && 0
500  printf("\nCombine value: %f\n", float(result));
501 #endif // DEBUG
502 
503  inc_combine_count();
504  mark_sequence_set(true);
505 
506  ap_assert(result >= 0);
507  return result;
508 }
509 
510 Mutations AP_sequence_protein::mutations_if_combined_with(const AP_combinableSeq *other) {
511  // Note: uses stupid brute-force implementation
512 
513  AP_combinableSeq *tmp = dup();
514  Mutations mut = tmp->combine_seq(this, other); // Note: calls inc_combine_count()
515 
516  delete tmp;
517  return mut;
518 }
519 
520 void AP_sequence_protein::partial_match(const AP_combinableSeq *part_, long *overlapPtr, long *penaltyPtr) const {
521  // matches the partial sequences 'part_' against 'this'
522  // '*penaltyPtr' is set to the number of mismatches (possibly weighted)
523  // '*overlapPtr' is set to the number of base positions both sequences overlap
524  // example:
525  // fullseq 'XXX---XXX' 'XXX---XXX'
526  // partialseq '-XX---XX-' '---XXX---'
527  // overlap 7 3
528  //
529  // algorithm is similar to AP_sequence_protein::combine()
530  // Note: changes done here should also be be applied to AP_seq_dna.cxx@partial_match_impl
531 
532  const AP_sequence_protein *part = (const AP_sequence_protein *)part_;
533 
534  const AP_PROTEINS *pf = get_sequence();
535  const AP_PROTEINS *pp = part->get_sequence();
536 
537  const AP_weights *weights = get_weights();
538 
539  long min_end; // minimum of both last non-gap positions
540  for (min_end = get_sequence_length()-1; min_end >= 0; --min_end) {
541  AP_PROTEINS both = AP_PROTEINS(pf[min_end]|pp[min_end]);
542  if (notHasGap(both)) { // last non-gap found
543  if (notHasGap(pf[min_end])) { // occurred in full sequence
544  for (; min_end >= 0; --min_end) { // search same in partial sequence
545  if (notHasGap(pp[min_end])) break;
546  }
547  }
548  else {
549  ap_assert(notHasGap(pp[min_end])); // occurred in partial sequence
550  for (; min_end >= 0; --min_end) { // search same in full sequence
551  if (notHasGap(pf[min_end])) break;
552  }
553  }
554  break;
555  }
556  }
557 
558  long penalty = 0;
559  long overlap = 0;
560 
561  if (min_end >= 0) {
562  long max_start; // maximum of both first non-gap positions
563  for (max_start = 0; max_start <= min_end; ++max_start) {
564  AP_PROTEINS both = AP_PROTEINS(pf[max_start]|pp[max_start]);
565  if (notHasGap(both)) { // first non-gap found
566  if (notHasGap(pf[max_start])) { // occurred in full sequence
567  for (; max_start <= min_end; ++max_start) { // search same in partial
568  if (notHasGap(pp[max_start])) break;
569  }
570  }
571  else {
572  ap_assert(notHasGap(pp[max_start])); // occurred in partial sequence
573  for (; max_start <= min_end; ++max_start) { // search same in full
574  if (notHasGap(pf[max_start])) break;
575  }
576  }
577  break;
578  }
579  }
580 
581  if (max_start <= min_end) { // if sequences overlap
582  for (long idx = max_start; idx <= min_end; ++idx) {
583  if ((pf[idx]&pp[idx]) == 0) { // bases are distinct (aka mismatch)
584  int mutations;
585  if (hasGap(AP_PROTEINS(pf[idx]|pp[idx]))) { // one is a gap
586  mutations = 3;
587  }
588  else {
589  mutations = INT_MAX;
590  for (int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) { // with all proteins to test
591  if (pf[idx] & prot2test[t1]) { // if protein is contained in subtree
592  for (int t2 = 0; t2<PROTEINS_TO_TEST; ++t2) {
593  if (pp[idx] & prot2test[t2]) {
594  int mut = prot_mindist[t1][t2];
595  if (mut<mutations) {
596  mutations = mut;
597  if (mutations < 2) break; // minimum reached -- abort
598  }
599  }
600  }
601  }
602  }
603  }
604  penalty += weights->weight(idx)*mutations;
605  }
606  }
607  overlap = (min_end-max_start+1)*3;
608  }
609  }
610 
611  *overlapPtr = overlap;
612  *penaltyPtr = penalty;
613 }
614 
615 AP_FLOAT AP_sequence_protein::count_weighted_bases() const {
616  AP_FLOAT wcount;
617  const AP_PROTEINS *sequence = get_sequence();
618 
619  if (!sequence) wcount = -1.0;
620  else {
621  long sum = 0;
622  size_t sequence_len = get_sequence_length();
623 
624  const AP_weights *weights = get_weights();
625 
626  for (size_t idx = 0; idx<sequence_len; ++idx) {
627  if (notHasGap(sequence[idx])) {
628  sum += weights->weight(idx) * 2.0;
629  }
630  else if /*has gap but */ (notIsGap(sequence[idx])) {
631  sum += weights->weight(idx);
632  }
633  }
634  wcount = sum * 0.5;
635  }
636  return wcount;
637 }
638 
639 uint32_t AP_sequence_protein::checksum() const {
640  const AP_PROTEINS *seq = get_sequence();
641  return GB_checksum(reinterpret_cast<const char *>(seq), sizeof(*seq)*get_sequence_length(), 0, NULp);
642 }
643 
644 int AP_sequence_protein::cmp_combined(const AP_combinableSeq *other) const {
645  const AP_sequence_protein *sother = DOWNCAST(const AP_sequence_protein*, other);
646 
647  const AP_PROTEINS *s1 = get_sequence();
648  const AP_PROTEINS *s2 = sother->get_sequence();
649 
650  size_t len = get_sequence_length();
651 
652  for (size_t i = 0; i<len; ++i) {
653  int comp = long_cmp(s1[i], s2[i]);
654  if (comp) return comp;
655  }
656 
657  // ap_assert(0); // location is reached from unittests. mut1 and mut2 could be tested as well
658  return 0;
659 }
660 
static AP_PROTEINS prot2test[PROTEINS_TO_TEST]
string result
static AP_PROTEINS one_mutation_away_8_15[256]
void GB_warning(const char *message)
Definition: arb_msg.cxx:530
static char prot_mindist[PROTEINS_TO_TEST][PROTEINS_TO_TEST]
const AWT_distance_meter * getDistanceMeter() const
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
bool notIsGap(AP_PROTEINS c)
#define PROTEINS_TO_TEST
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
const uchar * get_simplify_table() const
Definition: AP_filter.hxx:100
bool hasGap(AP_PROTEINS c)
static int prot_idx[PROTEINS_TO_TEST]
uint32_t GB_checksum(const char *seq, long length, int ignore_case, const char *exclude)
Definition: adstring.cxx:319
AP_PROTEINS
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
FILE * seq
Definition: rns.c:46
#define DOWNCAST(totype, expr)
Definition: downcast.h:141
bool isGap(AP_PROTEINS c)
double AP_FLOAT
Definition: AP_matrix.hxx:30
static int min_mutations_initialized_for_codenr
static AP_PROTEINS one_mutation_away_0_7[256]
int CodeNr() const
static int weights[MAX_BASETYPES][MAX_BASETYPES]
Definition: ClustalV.cxx:71
static void * D
Definition: mem.cxx:18
long Mutations
Definition: AP_sequence.hxx:99
fputc('\n', stderr)
size_t get_length() const
Definition: AP_filter.hxx:83
#define ap_assert(cond)
AP_PROTEINS calcOneMutationAway(AP_PROTEINS p)
static void update_min_mutations(int code_nr, const AWT_distance_meter *distance_meter)
AWT_translator * AWT_get_user_translator(GBDATA *gb_main)
static AP_PROTEINS one_mutation_away_16_23[256]
str readable(const copy< T > &v)
Definition: test_unit.h:345
AP_PROTEINS oneMutationAway(AP_PROTEINS p)
static AP_PROTEINS prot2AP_PROTEIN[26]
STATIC_ASSERT(APP_MAX==4194303)
unsigned char uchar
Definition: gde.hxx:21
bool notHasGap(AP_PROTEINS c)
#define NULp
Definition: cxxforward.h:116
long patd[3]
virtual Mutations combine_seq(const AP_combinableSeq *lefts, const AP_combinableSeq *rights, char *mutation_per_site=NULp)=0
bool use_position(size_t pos) const
Definition: AP_filter.hxx:85
GB_UINT4 weight(size_t idx) const
Definition: AP_filter.hxx:162
GBDATA * get_gb_main(DbSel db)
Definition: merge.hxx:88
const char * readable_combined_protein(AP_PROTEINS p)
const AWT_PDP * getDistance(int idx) const
CONSTEXPR_INLINE int long_cmp(const long L1, const long L2)
Definition: arbtools.h:199
GB_write_int const char s
Definition: AW_awar.cxx:154