21 #define CONSENSUS_AWAR_SOURCE CAS_INTERNAL // not awar-constructable here
33 #ifdef COUNT_BASES_TABLE_SIZE
34 static long bases_table_size = 0L;
39 if (no_of_bases.shortTable) {
40 delete no_of_bases.shortTable;
41 no_of_bases.shortTable =
NULp;
42 #ifdef COUNT_BASES_TABLE_SIZE
43 bases_table_size -= (no_of_entries+1)*table_entry_size;
48 int new_size = (no_of_entries+1)*table_entry_size;
49 no_of_bases.shortTable = (
unsigned char*)
new char[new_size];
50 memset(no_of_bases.shortTable, 0, new_size);
51 #ifdef COUNT_BASES_TABLE_SIZE
52 bases_table_size += new_size;
53 printf(
"bases_table_size = %li\n", bases_table_size);
61 int *new_table =
new int[no_of_entries+1];
64 for (i=0; i<=no_of_entries; i++) {
65 new_table[i] = no_of_bases.shortTable[i];
67 delete [] no_of_bases.shortTable;
69 #ifdef COUNT_BASES_TABLE_SIZE
71 printf(
"bases_table_size = %li\n", bases_table_size);
74 no_of_bases.longTable = new_table;
78 #define INVALID_TABLE_OPERATION() GBK_terminatef("SepBaseFreq: invalid operation at %i", __LINE__)
81 ct_assert(no_of_entries==other.no_of_entries);
89 for (i=start; i<=end; i++) {
90 set_elem_short(i, get_elem_short(i)+other.get_elem_short(i));
99 for (i=start; i<=end; i++) {
100 set_elem_long(i, get_elem_long(i)+other.get_elem_short(i));
104 for (i=start; i<=end; i++) {
105 set_elem_long(i, get_elem_long(i)+other.get_elem_long(i));
111 ct_assert(no_of_entries==other.no_of_entries);
119 for (i=start; i<=end; i++) {
120 set_elem_short(i, get_elem_short(i)-other.get_elem_short(i));
129 for (i=start; i<=end; i++) {
130 set_elem_long(i, get_elem_long(i)-other.get_elem_short(i));
134 for (i=start; i<=end; i++) {
135 set_elem_long(i, get_elem_long(i)-other.get_elem_long(i));
141 ct_assert(no_of_entries==Sub.no_of_entries);
142 ct_assert(no_of_entries==Add.no_of_entries);
145 int end = range.
end();
156 for (i=start; i<=end; i++) {
157 set_elem_short(i, get_elem_short(i)-Sub.get_elem_short(i)+Add.get_elem_short(i));
167 for (i=start; i<=end; i++) {
168 set_elem_long(i, get_elem_long(i)-Sub.get_elem_short(i)+Add.get_elem_short(i));
172 for (i=start; i<=end; i++) {
173 set_elem_long(i, get_elem_long(i)-Sub.get_elem_short(i)+Add.get_elem_long(i));
179 for (i=start; i<=end; i++) {
180 set_elem_long(i, get_elem_long(i)-Sub.get_elem_long(i)+Add.get_elem_short(i));
184 for (i=start; i<=end; i++) {
185 set_elem_long(i, get_elem_long(i)-Sub.get_elem_long(i)+Add.get_elem_long(i));
198 for (i=start; i<=end; i++) {
199 if (get_elem_short(i)!=other.get_elem_short(i)) {
200 *firstDifferentPos = i;
207 for (i=start; i<=end; i++) {
208 if (get_elem_short(i)!=other.get_elem_long(i)) {
209 *firstDifferentPos = i;
218 for (i=start; i<=end; i++) {
219 if (get_elem_long(i)!=other.get_elem_short(i)) {
220 *firstDifferentPos = i;
227 for (i=start; i<=end; i++) {
228 if (get_elem_long(i)!=other.get_elem_long(i)) {
229 *firstDifferentPos = i;
245 for (i=end; i>=
start; i--) {
246 if (get_elem_short(i)!=other.get_elem_short(i)) {
247 *lastDifferentPos = i;
254 for (i=end; i>=
start; i--) {
255 if (get_elem_short(i)!=other.get_elem_long(i)) {
256 *lastDifferentPos = i;
265 for (i=end; i>=
start; i--) {
266 if (get_elem_long(i)!=other.get_elem_short(i)) {
267 *lastDifferentPos = i;
274 for (i=end; i>=
start; i--) {
275 if (get_elem_long(i)!=other.get_elem_long(i)) {
276 *lastDifferentPos = i;
292 no_of_bases.shortTable =
NULp;
293 if (maxseqlength)
init(maxseqlength);
297 delete [] no_of_bases.shortTable;
298 #ifdef COUNT_BASES_TABLE_SIZE
299 bases_table_size -= (no_of_entries+1)*table_entry_size;
300 printf(
"bases_table_size = %li\n", bases_table_size);
305 if (new_length!=no_of_entries) {
306 int min_length = new_length<no_of_entries ? new_length : no_of_entries;
307 int growth = new_length-no_of_entries;
310 unsigned char *new_table =
new unsigned char[new_length+1];
312 ct_assert(default_entry>=0 && default_entry<256);
314 memcpy(new_table, no_of_bases.shortTable, min_length*
sizeof(*new_table));
315 new_table[new_length] = no_of_bases.shortTable[no_of_entries];
317 for (
int e=no_of_entries; e<new_length; ++e) new_table[e] = default_entry;
320 delete [] no_of_bases.shortTable;
321 no_of_bases.shortTable = new_table;
324 int *new_table =
new int[new_length+1];
326 memcpy(new_table, no_of_bases.longTable, min_length*
sizeof(*new_table));
327 new_table[new_length] = no_of_bases.longTable[no_of_entries];
329 for (
int e=no_of_entries; e<new_length; ++e) {
330 new_table[e] = default_entry;
334 delete [] no_of_bases.longTable;
335 no_of_bases.longTable = new_table;
338 #ifdef COUNT_BASES_TABLE_SIZE
339 bases_table_size += growth*table_entry_size;
340 printf(
"bases_table_size = %li\n", bases_table_size);
342 no_of_entries = new_length;
346 #if defined(TEST_CHAR_TABLE_INTEGRITY) || defined(ASSERTION_USED)
352 for (i=0; i<no_of_entries; i++) {
353 if (get_elem_short(i)) {
359 for (i=0; i<no_of_entries; i++) {
360 if (get_elem_long(i)) {
368 #endif // ASSERTION_USED
373 long entries = range.
size();
374 char *new_buf = ARB_alloc<char>(entries+1);
376 build_consensus_string_to(new_buf, range, cbp);
377 new_buf[entries] = 0;
393 #define PERCENT(part, all) ((100*(part))/(all))
394 #define MAX_BASES_TABLES 41 // 25
398 const int left_idx = range.
start();
399 const int right_idx = range.
end();
401 #if defined(DEBUG_CONSENSUS)
402 #define DUMPINT(var) do { if (dumpcol) fprintf(stderr, "%s=%i ", #var, var); } while(0)
408 for (
int i=left_idx; i<=right_idx; i++) {
409 #if defined(DEBUG_CONSENSUS)
411 bool dumpcol = (column == 21);
415 const int o = i-left_idx;
420 int max_base_idx = -1;
422 for (
int j=0; j<used_bases_tables; j++) {
423 base[j] = linear_table(j)[i];
424 if (!
isGap(index_to_upperChar(j))) {
426 if (base[j]>max_base) {
433 int gaps = sequenceUnits-bases;
441 if (gaps == sequenceUnits) {
442 consensus_string[o] =
'=';
447 consensus_string[o] =
'-';
458 for (
int j=0; j<used_bases_tables; j++) {
459 int bchar = index_to_upperChar(j);
463 #if defined(DEBUG_CONSENSUS)
466 used_bases[no_of_bases++] = index_to_upperChar(j);
475 used_bases[no_of_bases] = 0;
482 int group_count[amino_groups];
484 for (
int j=0; j<amino_groups; j++) {
487 for (
int j=0; j<used_bases_tables; j++) {
488 unsigned char bchar = index_to_upperChar(j);
496 for (
int j=0; j<amino_groups; j++) {
497 if (group_count[j]>kcount) {
499 kcount = group_count[j];
517 kchar = index_to_upperChar(max_base_idx);
530 if (percent>=BK.
upper) {
531 consensus_string[o] = kchar;
533 else if (percent>=BK.
lower) {
534 consensus_string[o] = tolower(kchar);
537 consensus_string[o] =
'.';
542 #if defined(DEBUG_CONSENSUS)
543 if (dumpcol) fprintf(stderr,
"result='%c'\n", consensus_string[o]);
548 memset(consensus_string,
'?', right_idx-left_idx+1);
556 bool BaseFrequencies::initialized =
false;
558 uint8_t BaseFrequencies::unitsPerSequence = 1;
559 unsigned char BaseFrequencies::char_to_index_tab[
MAXCHARTABLE];
562 int BaseFrequencies::used_bases_tables = 0;
565 inline void BaseFrequencies::set_char_to_index(
unsigned char c,
int index) {
566 ct_assert(index>=0 && index<used_bases_tables);
567 char_to_index_tab[upper_index_chars[index] = toupper(c)] = index;
568 char_to_index_tab[tolower(c)] = index;
571 void BaseFrequencies::expand_tables() {
573 for (i=0; i<used_bases_tables; i++) {
574 linear_table(i).expand_table_entry_size();
579 using namespace iupac;
580 static SmartCharPtr aadef;
582 if (aadef.isNull()) {
585 for (
int c =
'A'; c<=
'Z'; ++c) {
598 aadef = strdup(buffer);
605 const char *groups =
NULp;
606 const char *ambiguous =
NULp;
608 ali_type = ali_type_;
612 groups =
"A,C,G,TU,";
613 ambiguous =
"MRWSYKVHDBN";
614 unitsPerSequence = 12;
618 groups =
"A,C,G,UT,";
619 ambiguous =
"MRWSYKVHDBN";
620 unitsPerSequence = 12;
625 unitsPerSequence = 1;
634 #if defined(ASSERTION_USED)
646 int gapcharcount = 0;
647 char *unique_gap_chars = strdup(gap_chars);
648 for (
int i = 0; gap_chars[i]; ++i) {
651 unique_gap_chars[gapcharcount++] = gap_chars[i];
655 used_bases_tables = gapcharcount;
657 for (
int i=0; groups[i]; i++) {
658 if (groups[i]==
',') {
666 unsigned char init_val = used_bases_tables-1;
667 memset(char_to_index_tab, init_val, MAXCHARTABLE*
sizeof(*char_to_index_tab));
669 for (
int i=0; groups[i]; i++) {
670 if (groups[i]==
',') {
675 set_char_to_index(groups[i], idx);
679 const char *align_string_ptr = unique_gap_chars;
681 char c = *align_string_ptr++;
683 set_char_to_index(c, idx++);
690 for (
int i = 0; ambiguous[i]; ++i) {
691 const char *contained =
iupac::decode(ambiguous[i], ali_type,
false);
695 amb.
indices = strlen(contained);
699 for (
int j = 0; j<amb.
indices; ++j) {
705 char_to_index_tab[toupper(ambiguous[i])] =
710 free(unique_gap_chars);
722 for (
int i=0; i<used_bases_tables; i++) {
731 for (i=0; i<used_bases_tables; i++) {
732 linear_table(i).
init(maxseqlength);
742 for (
int i=0; i<used_bases_tables; i++) {
743 char c = upper_index_chars[i];
754 *bases = b/unitsPerSequence;
758 *gaps = g/unitsPerSequence;
768 for (
int i=0; i<used_bases_tables; i++) {
769 char c = upper_index_chars[i];
773 minus += table(c)[
column];
780 maxb =
std::max(maxb, table(c)[column]);
792 if (sequenceUnits>gaps) {
793 mfreq = maxb/double(sequenceUnits-gaps);
800 #if defined(ASSERTION_USED)
814 for (i=0; i<used_bases_tables; i++) {
815 delete bases_table[i];
818 delete [] bases_table;
828 for (i=0; i<used_bases_tables; i++) {
833 linear_table(i).
lastDifference(other.linear_table(i), end+1, Size-1, &end);
835 for (; i<used_bases_tables; i++) {
840 linear_table(i).
lastDifference(other.linear_table(i), end+1, Size-1, &end);
854 if (other.ignore)
return;
855 if (other.
size()==0)
return;
857 add(other, 0, other.
size()-1);
860 if (other.ignore)
return;
868 for (i=0; i<used_bases_tables; i++) {
869 linear_table(i).
add(other.linear_table(i),
start, end);
872 sequenceUnits += other.sequenceUnits;
878 if (other.ignore)
return;
879 sub(other, 0, other.
size()-1);
882 void BaseFrequencies::sub(
const BaseFrequencies& other,
int start,
int end) {
883 if (other.ignore)
return;
890 for (i=0; i<used_bases_tables; i++) {
891 linear_table(i).
sub(other.linear_table(i),
start, end);
894 sequenceUnits -= other.sequenceUnits;
908 for (i=0; i<used_bases_tables; i++) {
909 linear_table(i).
sub_and_add(Sub.linear_table(i), Add.linear_table(i), range);
916 const unsigned long *range1 = (
const unsigned long *)string1;
917 const unsigned long *range2 = (
const unsigned long *)string2;
918 const int step =
sizeof(*range1);
920 int l = min_len/step;
921 int r = min_len%step;
926 int start = -1, end = -1;
928 for (i=0; i<l; i++) {
929 if (range1[i] != range2[i]) {
931 for (j=0; j<step; j++) {
932 if (string1[k+j] != string2[k+j]) {
944 for (j=0; j<r; j++) {
945 if (string1[k+j] != string2[k+j]) {
958 for (j=r-1; j>=0; j--) {
959 if (string1[k+j] != string2[k+j]) {
967 for (i=l-1; i>=m; i--) {
968 if (range1[i] != range2[i]) {
970 for (j=step-1; j>=0; j--) {
971 if (string1[k+j] != string2[k+j]) {
989 #define INC_DEC(incdec) do { \
990 int idx = char_to_index_tab[c]; \
991 if (idx<MAX_INDEX_TABLES) { \
992 table(c).incdec(offset, unitsPerSequence); \
995 Ambiguity& amb = ambiguity_table[idx-MAX_INDEX_TABLES]; \
996 for (int i = 0; i<amb.indices; ++i) { \
997 linear_table(amb.index[i]).incdec(offset, amb.increment); \
1002 void BaseFrequencies::incr_short(
unsigned char c,
int offset) {
1006 void BaseFrequencies::decr_short(
unsigned char c,
int offset) {
1010 void BaseFrequencies::incr_long(
unsigned char c,
int offset) {
1014 void BaseFrequencies::decr_long(
unsigned char c,
int offset) {
1023 prepare_to_add_elements(1);
1029 for (i=0; i<len; i++) {
1030 unsigned char c = scan_string[i];
1040 for (i=0; i<len; i++) {
1041 unsigned char c = scan_string[i];
1051 sequenceUnits += unitsPerSequence;
1063 for (i=0; i<len; i++) {
1064 unsigned char c = scan_string[i];
1074 for (i=0; i<len; i++) {
1075 unsigned char c = scan_string[i];
1085 sequenceUnits -= unitsPerSequence;
1092 int start = range.
start();
1093 int end = range.
end();
1097 for (
int i=start; i<=end; i++) {
1098 unsigned char o = old_string[i];
1099 unsigned char n = new_string[i];
1110 for (
int i=start; i<=end; i++) {
1111 unsigned char o = old_string[i];
1112 unsigned char n = new_string[i];
1127 for (
int c=0; c<used_bases_tables; c++) {
1136 #if defined(TEST_CHAR_TABLE_INTEGRITY) || defined(ASSERTION_USED)
1140 for (i=0; i<used_bases_tables; i++) {
1141 if (!linear_table(i).
empty()) {
1149 if (
empty())
return true;
1152 if (sequenceUnits<0) {
1153 fprintf(stderr,
"Negative # of sequences (%i) in BaseFrequencies\n",
added_sequences());
1157 const int table_size =
size();
1158 for (
int i=0; i<table_size; i++) {
1162 if (!bases && !gaps) {
1163 for (
int j=i+1; j<table_size; j++) {
1167 fprintf(stderr,
"Zero in BaseFrequencies at column %i\n", i);
1175 fprintf(stderr,
"bases+gaps should be equal to # of sequences at column %i (bases=%i, gaps=%i, added_sequences=%i)\n",
1185 #endif // TEST_CHAR_TABLE_INTEGRITY || ASSERTION_USED
1187 #if defined(TEST_CHAR_TABLE_INTEGRITY)
1189 void BaseFrequencies::test()
const {
1196 #endif // TEST_CHAR_TABLE_INTEGRITY
1207 #define SETUP(gapChars,alitype) BaseFrequencies::setup(gapChars,alitype)
1209 void TEST_char_table() {
1210 for (
int useAminoAcids = 0; useAminoAcids<=1; ++useAminoAcids) {
1211 const char *alphabeth = useAminoAcids ?
"ABCDEFGHIJKLMNPQRSTVWXYZ-" :
"ACGTN-";
1212 const int alphabeth_size = strlen(alphabeth);
1216 const int SEQLEN = 30;
1219 const char *gapChars =
".-";
1225 if (useAminoAcids) {
1227 TEST_EXPECT_EQUAL(
getAAgroupDefinition(),
"A,B,C,D,E,F,G,H,I,J,K,L,M,N,P,Q,R,S,T,V,W,Y,Z,X");
1234 const int LOOPS = 5;
1238 for (
int loop = 0; loop<5; ++loop) {
1242 for (
int loop = 0; loop<5; ++loop) {
1248 for (
int c = 0; c<SEQLEN; ++c) {
1249 seq[c] = alphabeth[
GB_random(alphabeth_size)];
1255 const int add_count = 300;
1256 char *added_seqs[add_count];
1260 if (useAminoAcids) {
1261 s1 =
"ABCDEFGHIJKLMNPQRSTVWXYZabcdef";
1262 s2 =
"ghijklmnpqrstvwxyzABCDEFGHIJKL";
1265 s1 =
"ACGTACGTAcgtacgtaCGTACGTACGTAC";
1266 s2 =
"MRWSYKVHDBNmrwsykvhdbnSYKVHDBN";
1270 for (
int a = 0; a<add_count; ++a) {
1271 tab.add(seq, SEQLEN);
1272 added_seqs[a] = strdup(seq);
1277 seq[sidx] = alphabeth[aidx];
1281 char *consensus =
tab.build_consensus_string(BK);
1283 tab.add(s1, SEQLEN);
1285 tab.sub_and_add(s1, s2, r);
1286 tab.sub_and_add(s2, consensus, r);
1287 tab.sub(consensus, SEQLEN);
1289 char *consensus2 =
tab.build_consensus_string(BK);
1300 char *consensus =
tab.build_consensus_string(BK);
1301 if (useAminoAcids) {
1302 switch (seed[loop]) {
1303 case 1080710223:
TEST_EXPECT_EQUAL(consensus,
"a.i.dXi.haai.hifdi.d.adXaa..dX");
break;
1304 case 290469779:
TEST_EXPECT_EQUAL(consensus,
".......I.D.A......-f..iXi...dX");
break;
1305 case 346019116:
TEST_EXPECT_EQUAL(consensus,
"if.a-hd.hd..a.di-ddf.i..aXIaI.");
break;
1306 case 1708473987:
TEST_EXPECT_EQUAL(consensus,
"aia..df...haDaadd.d.fha.ii.X..");
break;
1307 case 2080862653:
TEST_EXPECT_EQUAL(consensus,
"a.ia.id...a.aah.had......i..d.");
break;
1312 switch (seed[loop]) {
1313 case 1080710223:
TEST_EXPECT_EQUAL(consensus,
"W.gKyc.sR.urSkGcy.kuy......RY-");
break;
1314 case 290469779:
TEST_EXPECT_EQUAL(consensus,
".u-SW.Mgcu.uw-uu..-mmb.sgcRuY.");
break;
1315 case 346019116:
TEST_EXPECT_EQUAL(consensus,
"y-m.-Suy.c.-u.-g-y.cWgr.cyGrgY");
break;
1316 case 1708473987:
TEST_EXPECT_EQUAL(consensus,
".K.Kc--.cakyw.uW..a-.guuggwH-a");
break;
1317 case 2080862653:
TEST_EXPECT_EQUAL(consensus,
".sgaR..--.uu.cMuuycsk-u.cu.Wc.");
break;
1324 tab.add(s1, SEQLEN);
1326 tab.sub_and_add(s1, s2, r);
1327 tab.sub_and_add(s2, consensus, r);
1328 tab.sub(consensus, SEQLEN);
1330 char *consensus2 =
tab.build_consensus_string(BK);
1337 for (
int a = 0; a<add_count; a++) {
1338 tab.sub(added_seqs[a], SEQLEN);
1339 freenull(added_seqs[a]);
1342 char *consensus =
tab.build_consensus_string(BK);
1350 void TEST_SepBaseFreq() {
1356 for (
int i = 0; i<100; ++i) short1.inc_short(OFF1, 1);
1357 for (
int i = 0; i<70; ++i) short1.inc_short(OFF2, 1);
1358 for (
int i = 0; i<150; ++i) short2.inc_short(OFF1, 1);
1359 for (
int i = 0; i<80; ++i) short2.inc_short(OFF2, 1);
1362 long1.expand_table_entry_size();
1363 long2.expand_table_entry_size();
1364 for (
int i = 0; i<2000; ++i) long1.inc_long(OFF1, 1);
1365 for (
int i = 0; i<2500; ++i) long2.inc_long(OFF1, 1);
1377 longsum.expand_table_entry_size();
1400 #endif // UNIT_TESTS
#define INVALID_TABLE_OPERATION()
static GB_ERROR tab(GBL_command_arguments *args, bool pretab)
void add(const SepBaseFreq &other, int start, int end)
BaseFrequencies(int maxseqlength=0)
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
void change_table_length(int new_length, int default_entry)
char * build_consensus_string(PosRange r, const ConsensusBuildParams &cbp) const
void change_table_length(int new_length)
int firstDifference(const SepBaseFreq &other, int start, int end, int *firstDifferentPos) const
double max_frequency_at(int column, bool ignore_gaps) const
void dec_short(int offset, int decr)
#define MAX_TARGET_INDICES
char get_amino_consensus_char(Amino_Group ag)
char buffer[MESSAGE_BUFFERSIZE]
uint8_t index[MAX_TARGET_INDICES]
static const char * getAAgroupDefinition()
static HelixNrInfo * start
#define PERCENT(part, all)
void inc_short(int offset, int incr)
static void setup(const char *gap_chars, GB_alignment_type ali_type_)
const char * decode(char iupac, GB_alignment_type aliType, bool decode_amino_iupac_groups)
void init(int maxseqlength)
void GB_random_seed(unsigned seed)
void sub_and_add(const BaseFrequencies &Sub, const BaseFrequencies &Add, PosRange range)
void inc_long(int offset, int incr)
static bool isGap(char c)
const PosRange * changed_range(const BaseFrequencies &other) const
void GBK_terminate(const char *error) __ATTR__NORETURN
SepBaseFreq(int maxseqlength)
void bases_and_gaps_at(int column, int *bases, int *gaps) const
void scan_string(ED4_multi_species_manager *parent, ED4_reference_terminals &refterms, const char *str, int *index, arb_progress &progress)
void dec_long(int offset, int decr)
#define LONG_TABLE_ELEM_SIZE
void expand_table_entry_size()
#define SHORT_TABLE_ELEM_SIZE
#define TEST_EXPECTATION(EXPCTN)
int lastDifference(const SepBaseFreq &other, int start, int end, int *lastDifferentPos) const
void sub(const SepBaseFreq &other, int start, int end)
Amino_Group get_amino_group_for(char aa)
#define MAX_USED_BASES_TABLES
char encode(const char bases[], GB_alignment_type aliType)
void sub_and_add(const SepBaseFreq &Sub, const SepBaseFreq &Add, PosRange range)
void build_consensus_string_to(char *consensus_string, ExplicitRange range, const ConsensusBuildParams &BK) const
#define MAX_AMBIGUITY_CODES
#define TEST_EXPECT_EQUAL(expr, want)
int added_sequences() const