18 AP_sequence_protein::AP_sequence_protein(
const AliView *aliview) :
25 AP_sequence_protein::~AP_sequence_protein() {
32 return new AP_sequence_protein(get_aliview());
64 #define PROTEINS_TO_TEST 22 // 26 plus gap and star, minus 3 illegal, 'X', 'B' and 'Z'
118 if (p ==
APP_X) {
return "X"; }
119 if (p ==
APP_DOT) {
return "."; }
124 const char *
readable =
"*ACDEFGHIKLMNPQRSTVWY-";
127 if (p&prot2test[b]) {
139 #define OMA_SLOW_LOWMEM
141 #if defined(ASSERTION_USED) && 0
142 #define OMA_DOUBLE_CHECK
145 #if defined(OMA_DOUBLE_CHECK)
146 # define IMPL_OMA_SLOW_LOWMEM
147 # define IMPL_OMA_FAST_BIGMEM
149 # if defined(OMA_SLOW_LOWMEM)
150 # define IMPL_OMA_SLOW_LOWMEM
152 # define IMPL_OMA_FAST_BIGMEM
159 #if defined(IMPL_OMA_FAST_BIGMEM)
166 #if defined(IMPL_OMA_SLOW_LOWMEM)
173 return AP_PROTEINS(one_mutation_away_0_7 [ p & 0xff] |
174 one_mutation_away_8_15 [(p>>8) & 0xff] |
175 one_mutation_away_16_23[(p>>16) & 0xff]);
182 #if defined(OMA_SLOW_LOWMEM)
185 return one_mutation_away[p];
190 if (min_mutations_initialized_for_codenr != code_nr) {
199 for (i = 0; i<3; ++i) {
200 if (dist->
patd[i] & D_bit)
break;
203 prot_mindist[
s][d] =
char(i);
208 #if defined(IMPL_OMA_FAST_BIGMEM)
209 memset(one_mutation_away, 0,
sizeof(one_mutation_away));
211 #if defined(IMPL_OMA_SLOW_LOWMEM)
212 memset(one_mutation_away_0_7, 0,
sizeof(one_mutation_away_0_7));
213 memset(one_mutation_away_8_15, 0,
sizeof(one_mutation_away_8_15));
214 memset(one_mutation_away_16_23, 0,
sizeof(one_mutation_away_16_23));
220 if (prot_mindist[
s][d] == 1) {
228 #if defined(IMPL_OMA_FAST_BIGMEM)
229 one_mutation_away[source] = oma;
231 #if defined(IMPL_OMA_SLOW_LOWMEM)
232 uint32_t idx = source & 0xff;
if (idx) one_mutation_away_0_7 [idx] = oma;
233 idx = (source>>8) & 0xff;
if (idx) one_mutation_away_8_15 [idx] = oma;
234 idx = (source>>16) & 0xff;
if (idx) one_mutation_away_16_23[idx] = oma;
238 #if defined(IMPL_OMA_FAST_BIGMEM)
239 for (
size_t i = 0; i<=
APP_MAX; ++i) {
246 if (j&1) oma =
AP_PROTEINS(oma|one_mutation_away[b]);
251 one_mutation_away[i] = oma;
255 #if defined(IMPL_OMA_SLOW_LOWMEM)
256 for (
int s = 0;
s<8;
s++) {
258 for (
int i=b+1; i<256; i++) {
260 one_mutation_away_0_7[i] =
AP_PROTEINS(one_mutation_away_0_7[i] | one_mutation_away_0_7[b]);
261 one_mutation_away_8_15[i] =
AP_PROTEINS(one_mutation_away_8_15[i] | one_mutation_away_8_15[b]);
262 one_mutation_away_16_23[i] =
AP_PROTEINS(one_mutation_away_16_23[i] | one_mutation_away_16_23[b]);
268 #if defined(IMPL_OMA_FAST_BIGMEM) && defined(DEBUG)
269 for (
size_t i = 0; i<=
APP_MAX; ++i) {
270 if (one_mutation_away[i] == 0) {
274 for (
size_t i = 0; i<=
APP_MAX; ++i) {
275 AP_PROTEINS two_mutations_away = one_mutation_away[one_mutation_away[i]];
277 if ((!gap && two_mutations_away !=
APP_X) || (gap && two_mutations_away !=
APP_DOT)) {
284 for (
size_t i = 0; i<=
APP_MAX; ++i) {
285 AP_PROTEINS three_mutations_away = one_mutation_away[one_mutation_away[one_mutation_away[i]]];
287 if ((!gap && three_mutations_away !=
APP_X) || (gap && three_mutations_away !=
APP_DOT)) {
295 #if defined(OMA_DOUBLE_CHECK)
296 for (
size_t i = 0; i<=
APP_MAX; ++i) {
302 min_mutations_initialized_for_codenr = code_nr;
311 void AP_sequence_protein::set(
const char *isequence) {
315 size_t sequence_len = get_sequence_length();
321 ap_assert(!get_filter()->does_bootstrap());
325 int left_bases = sequence_len;
335 for (
int idx = 0; idx<filter_len && left_bases; ++idx) {
340 #if defined(SHOW_SEQ)
344 if (c >=
'A' && c <=
'Z') p = prot2AP_PROTEIN[c-
'A'];
345 else if (c ==
'-') p =
APP_GAP;
346 else if (c ==
'.') p =
APP_DOT;
366 #if defined(SHOW_SEQ)
370 mark_sequence_set(
true);
373 void AP_sequence_protein::unset() {
374 delete [] mut2; mut2 =
NULp;
375 delete [] mut1; mut1 =
NULp;
376 delete [] seq_prot; seq_prot =
NULp;
377 mark_sequence_set(
false);
385 const AP_sequence_protein *left =
DOWNCAST(
const AP_sequence_protein*, lefts);
386 const AP_sequence_protein *right =
DOWNCAST(
const AP_sequence_protein*, rights);
388 size_t sequence_len = get_sequence_length();
405 char *mutpsite = mutation_per_site;
411 for (
size_t idx = 0; idx<sequence_len; ++idx) {
430 AP_PROTEINS max_cost_2 =
AP_PROTEINS((contained_in_any & reachable_from_both_with_2_mut) | reachable_from_both_with_1_mut);
432 if (contained_in_both) {
433 p[idx] = contained_in_both;
434 mut1[idx] = max_cost_1;
435 mut2[idx] = max_cost_2;
438 int mutations = INT_MAX;
443 | reachable_from_both_with_3_mut);
449 mut1[idx] = max_cost_2;
450 mut2[idx] = max_cost_3;
456 | reachable_from_both_with_2_mut);
461 mut1[idx] = max_cost_3;
462 mut2[idx] = max_cost_4;
470 mut1[idx] = max_cost_4;
471 mut2[idx] = reachable_from_any_with_2_mut;
474 ap_assert(mutations >= 1 && mutations <= 3);
476 if (mutpsite) mutpsite[idx] += mutations;
477 result += mutations * weights->
weight(idx);
487 #if defined(DEBUG) && 0
491 for (
long idx = P1; idx <= P2; ++idx) printf(
"%3i ", p1[idx]);
493 for (
long idx = P1; idx <= P2; ++idx) printf(
"%3i ", p2[idx]);
494 printf(
"\nCombine value: %f\n",
float(result));
499 #if defined(DEBUG) && 0
500 printf(
"\nCombine value: %f\n",
float(result));
504 mark_sequence_set(
true);
520 void AP_sequence_protein::partial_match(
const AP_combinableSeq *part_,
long *overlapPtr,
long *penaltyPtr)
const {
532 const AP_sequence_protein *part = (
const AP_sequence_protein *)part_;
540 for (min_end = get_sequence_length()-1; min_end >= 0; --min_end) {
544 for (; min_end >= 0; --min_end) {
550 for (; min_end >= 0; --min_end) {
563 for (max_start = 0; max_start <= min_end; ++max_start) {
567 for (; max_start <= min_end; ++max_start) {
573 for (; max_start <= min_end; ++max_start) {
581 if (max_start <= min_end) {
582 for (
long idx = max_start; idx <= min_end; ++idx) {
583 if ((pf[idx]&pp[idx]) == 0) {
590 for (
int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) {
591 if (pf[idx] & prot2test[t1]) {
593 if (pp[idx] & prot2test[t2]) {
594 int mut = prot_mindist[t1][t2];
597 if (mutations < 2)
break;
604 penalty += weights->
weight(idx)*mutations;
607 overlap = (min_end-max_start+1)*3;
611 *overlapPtr = overlap;
612 *penaltyPtr = penalty;
615 AP_FLOAT AP_sequence_protein::count_weighted_bases()
const {
619 if (!sequence) wcount = -1.0;
622 size_t sequence_len = get_sequence_length();
626 for (
size_t idx = 0; idx<sequence_len; ++idx) {
628 sum += weights->
weight(idx) * 2.0;
631 sum += weights->
weight(idx);
639 uint32_t AP_sequence_protein::checksum()
const {
641 return GB_checksum(reinterpret_cast<const char *>(seq),
sizeof(*seq)*get_sequence_length(), 0,
NULp);
644 int AP_sequence_protein::cmp_combined(
const AP_combinableSeq *other)
const {
645 const AP_sequence_protein *sother =
DOWNCAST(
const AP_sequence_protein*, other);
650 size_t len = get_sequence_length();
652 for (
size_t i = 0; i<len; ++i) {
654 if (comp)
return comp;
static AP_PROTEINS prot2test[PROTEINS_TO_TEST]
static AP_PROTEINS one_mutation_away_8_15[256]
void GB_warning(const char *message)
static char prot_mindist[PROTEINS_TO_TEST][PROTEINS_TO_TEST]
const AWT_distance_meter * getDistanceMeter() const
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
bool notIsGap(AP_PROTEINS c)
const char * GBS_global_string(const char *templat,...)
const uchar * get_simplify_table() const
bool hasGap(AP_PROTEINS c)
static int prot_idx[PROTEINS_TO_TEST]
uint32_t GB_checksum(const char *seq, long length, int ignore_case, const char *exclude)
char buffer[MESSAGE_BUFFERSIZE]
#define DOWNCAST(totype, expr)
bool isGap(AP_PROTEINS c)
static int min_mutations_initialized_for_codenr
static AP_PROTEINS one_mutation_away_0_7[256]
static int weights[MAX_BASETYPES][MAX_BASETYPES]
size_t get_length() const
AP_PROTEINS calcOneMutationAway(AP_PROTEINS p)
static void update_min_mutations(int code_nr, const AWT_distance_meter *distance_meter)
AWT_translator * AWT_get_user_translator(GBDATA *gb_main)
static AP_PROTEINS one_mutation_away_16_23[256]
str readable(const copy< T > &v)
AP_PROTEINS oneMutationAway(AP_PROTEINS p)
static AP_PROTEINS prot2AP_PROTEIN[26]
STATIC_ASSERT(APP_MAX==4194303)
bool notHasGap(AP_PROTEINS c)
virtual Mutations combine_seq(const AP_combinableSeq *lefts, const AP_combinableSeq *rights, char *mutation_per_site=NULp)=0
bool use_position(size_t pos) const
GB_UINT4 weight(size_t idx) const
GBDATA * get_gb_main(DbSel db)
const char * readable_combined_protein(AP_PROTEINS p)
const AWT_PDP * getDistance(int idx) const
GB_write_int const char s