22 #define ali_assert(cond) arb_assert(cond)
31 const T*
start()
const {
return bstart; }
32 size_t offset()
const {
return curr-bstart; }
34 T get() {
return *curr++; }
36 void put(
T c) { *curr++ = c; }
38 void put(
T c,
size_t count) {
39 memset(curr, c, count*
sizeof(
T));
43 memcpy(curr, source, count*
sizeof(
T));
53 operator const T*()
const {
return curr; }
54 operator T*() {
return curr; }
70 size_t length()
const {
return len; }
105 const char *msg =
GBS_global_string(
"You have not selected a destination alignment\n"
106 "Shall I create one ('%s_pro') for you?", ali_source);
108 error =
"Cancelled by user";
131 int spec_no_transl_info = 0;
134 int selected_ttable = -1;
140 memset(table_used, 0,
sizeof(table_used));
145 gb_species && !
error;
148 int arb_table, codon_start;
152 if (arb_table == -1) arb_table = selected_ttable;
153 table_used[arb_table] =
true;
158 table_used[selected_ttable] =
true;
162 if (!table_used[table])
continue;
165 gb_species && !
error;
168 bool found_transl_info =
false;
169 int startpos = selected_startpos;
172 int sp_arb_table, sp_codon_start;
178 if (sp_arb_table == -1) {
180 sp_arb_table = selected_ttable;
181 sp_codon_start = selected_startpos;
185 found_transl_info =
true;
189 if (sp_arb_table != table)
continue;
191 startpos = sp_codon_start;
195 if (!gb_source) { ++no_data; }
198 if (!gb_source_data) { ++no_data; }
207 if (!found_transl_info) ++spec_no_transl_info;
212 int least_stop_codons = -1;
215 for (cn = 0 ; cn < 3 ; cn++) {
216 stop_codons =
translate_nuc2aa(table, trial_data[cn], data_size, cn, translate_all,
false,
false,
NULp);
218 if ((stop_codons < least_stop_codons) ||
219 (least_stop_codons == -1))
221 least_stop_codons = stop_codons;
226 for (cn = 0 ; cn < 3 ; cn++) {
227 if (cn != startpos) {
228 free(trial_data[cn]);
232 data = trial_data[startpos];
233 stops += least_stop_codons;
248 if (!error && save_entries && !found_transl_info) {
263 if (spec_no_transl_info) {
266 "Defaults (%i and %i) have been used%s.",
268 embl_transl_table, selected_startpos+1,
269 save_entries ?
" and written to DB entries" :
""));
272 GB_warning(
"codon_start and transl_table entries were found for all translated taxa");
279 if ((count+no_data) == 0) {
280 GB_warning(
"Please mark species to translate");
284 count, (
double)stops/(
double)count));
303 void fillFrom(
int off) {
308 int leftX = xcount-off;
309 int leftDNA = left[off];
310 int minLeave = leftX-1;
311 int maxLeave = minLeave*3;
312 int minTake =
std::max(1, leftDNA-maxLeave);
314 #if defined(ASSERTION_USED)
315 int maxTake =
std::min(3, leftDNA-minLeave);
320 left[off+1] = left[off]-dist[off];
323 }
while (off<xcount);
327 bool incAt(
int off) {
331 if (dist[off] == 3) {
335 int leftX = xcount-off;
336 int leftDNA = left[off];
337 int minLeave = leftX-1;
338 int maxTake =
std::min(3, leftDNA-minLeave);
340 if (dist[off] == maxTake) {
353 dist(new
int[xcount]),
354 left(new
int[xcount+1]),
357 if (dnacount<xcount) {
358 error =
"not enough nucleotides";
360 else if (dnacount>3*xcount) {
361 error =
"too much nucleotides";
369 : xcount(other.xcount),
370 dist(new
int[xcount]),
371 left(new
int[xcount+1]),
374 memcpy(dist, other.dist,
sizeof(*dist)*xcount);
375 memcpy(left, other.left,
sizeof(*left)*(xcount+1));
391 int size()
const {
return xcount; }
396 for (
int incPos = xcount-2; incPos>=0; --incPos) {
397 if (incAt(incPos))
return true;
403 for (
int i = 0; i<xcount; ++i) {
404 if (dist[i] == 3)
return true;
411 for (
int i = 0; i<xcount; ++i) {
414 return score + 6 - dist[0] - dist[xcount-1];
424 bool translates =
true;
426 for (
int p = 0; translates && p<xcount; off += dist[p++]) {
429 translates =
AWT_is_codon(
'X', dna+off, allowed, this_remaining);
432 allowed = this_remaining;
436 if (translates) remaining = allowed;
450 int cmp(
const FailedAt& other)
const {
451 ptrdiff_t d = at_prot - other.at_prot;
452 if (!d) d = at_dna - other.at_dna;
453 return d<0 ? -1 : d>0 ? 1 : 0;
471 const char *
dna_at()
const {
return at_dna; }
473 operator bool()
const {
return !reason.empty(); }
477 reason =
string(prefix)+reason;
493 bool sync_behind_X_and_distribute(
const int x_count,
char *
const x_start,
const char *
const x_start_prot);
496 RealignAttempt(
const TransTables& allowed_,
const char *compressed_dna_,
size_t compressed_len_,
const char *aligned_protein_,
char *target_dna_,
size_t target_len_,
bool cutoff_dna_)
498 compressed_dna(compressed_dna_, compressed_len_),
499 aligned_protein(aligned_protein_),
500 target_dna(target_dna_, target_len_),
501 cutoff_dna(cutoff_dna_)
529 while (dist.next()) {
530 if (dist.get_score() > best.
get_score()) {
533 best_remaining = allowed;
542 best_remaining = curr_remaining;
547 error =
"no translating X-distribution found";
550 if (dist.translates_to_Xs(dna, allowed, curr_remaining)) {
552 best_remaining = curr_remaining;
557 }
while (dist.next());
559 while (dist.next()) {
560 if (dist.get_score() > best.
get_score()) {
561 if (dist.translates_to_Xs(dna, allowed, curr_remaining)) {
563 best_remaining = curr_remaining;
572 for (
int x = 0; x<best.
size(); ++x) {
573 while (xtarget[0] !=
'!') {
580 enum { UNDECIDED,
SPREAD, LEFT, RIGHT } mode = UNDECIDED;
582 bool is_1st_X = xtarget.
offset() == 0;
583 bool gaps_left = is_1st_X ? gap_before :
isGap(xtarget[-1]);
585 if (gaps_left) mode = LEFT;
587 bool is_last_X = x == best.
size()-1;
588 int next_nucs = is_last_X ? 0 : best[x+1];
589 bool gaps_right =
isGap(xtarget[3]) || next_nucs == 1 || (is_last_X && gap_after);
591 if (gaps_right) mode = RIGHT;
593 bool nogaps_right = next_nucs == 3 || (is_last_X && !gap_after);
595 mode = is_last_X ? LEFT : (is_1st_X ? RIGHT :
SPREAD);
609 case SPREAD: xtarget.
put(d1,
'-', d2);
break;
610 case LEFT: xtarget.
put(d1, d2,
'-');
break;
611 case RIGHT: xtarget.
put(
'-', d1, d2);
break;
616 case 1: xtarget.
put(
'-', dna.
get(),
'-');
break;
617 case 3: xtarget.
copy(dna, 3);
break;
624 remaining = best_remaining;
632 bool RealignAttempt::sync_behind_X_and_distribute(
const int x_count,
char *
const x_start,
const char *
const x_start_prot) {
640 bool complete =
false;
644 const char p = aligned_protein[-1];
646 size_t compressed_rest_len = compressed_dna.
restLength();
647 ali_assert(strlen(compressed_dna) == compressed_rest_len);
649 size_t min_dna = x_count;
650 size_t max_dna =
std::min(
size_t(x_count)*3, compressed_rest_len);
652 if (min_dna>max_dna) {
653 fail =
FailedAt(
"not enough nucs for X's at sequence end", x_start_prot, compressed_dna);
657 size_t target_rest_len = target_dna.
restLength();
659 for (
size_t x_dna = min_dna; x_dna<=max_dna; ++x_dna) {
660 const char *dna_rest = compressed_dna + x_dna;
661 size_t dna_rest_len = compressed_rest_len - x_dna;
666 RealignAttempt attemptRest(allowed, dna_rest, dna_rest_len, aligned_protein-1, target_dna, target_rest_len, cutoff_dna);
667 FailedAt restFailed = attemptRest.failed();
672 bool has_gap_before = x_start == target_dna.
start() ?
true :
isGap(x_start[-1]);
673 bool has_gap_after =
isGap(dna_rest[0]);
676 GB_ERROR disterr =
distribute_xdata(distrib_dna, x_count, x_start, has_gap_before, has_gap_after, attemptRest.get_remaining_tables(), remaining);
678 restFailed =
FailedAt(disterr, x_start_prot, dna_rest);
688 if (restFailed > foremost) foremost = restFailed;
700 if (!strstr(fail.
why(),
"Sync behind 'X'")) {
701 fail.
add_prefix(
"Sync behind 'X' failed foremost with: ");
709 GB_ERROR fail_reason =
"internal error: no distribution attempted";
712 for (x_dna = max_dna; x_dna>=min_dna; --x_dna) {
715 fail_reason =
distribute_xdata(append_dna, x_count, x_start,
false,
true, allowed, remaining);
724 fail =
FailedAt(fail_reason, x_start_prot+1, compressed_dna);
728 compressed_dna.
inc(x_dna);
737 void RealignAttempt::perform() {
738 bool complete =
false;
740 while (
char p = toupper(aligned_protein.
get())) {
742 char *x_start = target_dna;
743 const char *x_start_prot = aligned_protein-1;
747 if (p==
'X') { x_count++; target_dna.
put(
'!', 3); }
748 else if (
isGap(p)) target_dna.
put(p, 3);
751 p = toupper(aligned_protein.
get());
756 complete = sync_behind_X_and_distribute(x_count, x_start, x_start_prot);
757 if (!complete && !
failed()) {
759 fail =
FailedAt(
"internal error", aligned_protein-1, compressed_dna);
769 size_t compressed_rest_len = compressed_dna.
restLength();
771 if (compressed_rest_len<3) {
775 ali_assert(strlen(compressed_dna) == compressed_rest_len);
777 const char *why_fail;
778 if (!
AWT_is_codon(p, compressed_dna, allowed, remaining, &why_fail)) {
779 fail =
FailedAt(why_fail, aligned_protein-1, compressed_dna);
787 target_dna.
copy(compressed_dna, 3);
793 if (!
failed() && !complete) {
794 while (target_dna.
offset()>0 &&
isGap(target_dna[-1])) --target_dna;
797 size_t compressed_rest_len = compressed_dna.
restLength();
798 size_t target_rest_len = target_dna.
restLength();
799 if (compressed_rest_len<=target_rest_len) {
800 target_dna.
copy(compressed_dna, compressed_rest_len);
804 compressed_rest_len, target_rest_len),
805 aligned_protein-1, compressed_dna);
813 #if defined(ASSERTION_USED)
820 inline char *
unalign(
const char *data,
size_t len,
size_t& compressed_len) {
822 char *compressed = ARB_alloc<char>(len+1);
824 for (
size_t p = 0; p<len && data[p]; ++p) {
825 if (!
isGap(data[p])) {
826 compressed[compressed_len++] = data[p];
829 compressed[compressed_len] = 0;
834 const char *ali_source;
835 const char *ali_dest;
838 size_t needed_ali_len;
840 const char *fail_reason;
842 GB_ERROR annotate_fail_position(
const FailedAt& failed,
const char *source,
const char *dest,
const char *compressed_dest) {
843 int source_fail_pos = failed.
protein_at() - source;
844 int dest_fail_pos = 0;
846 int fail_d_base_count = failed.
dna_at() - compressed_dest;
848 const char *dp = dest;
858 dest_fail_pos = (dp-1)-dest;
859 if (!fail_d_base_count)
break;
866 ali_source,
info2bio(source_fail_pos),
871 static void calc_needed_dna(
const char *prot,
size_t len,
size_t& minDNA,
size_t& maxDNA) {
873 for (
size_t o = 0; o<len; ++o) {
874 char p = toupper(prot[o]);
879 else if (!
isGap(p)) {
885 static size_t countLeadingGaps(
const char *
buffer) {
887 for (
int o = 0;
isGap(buffer[o]); ++o) ++gaps;
892 Realigner(
const char *ali_source_,
const char *ali_dest_,
size_t ali_len_)
893 : ali_source(ali_source_),
906 const char *
failure()
const {
return fail_reason; }
908 char *
realign_seq(
TransTables& allowed,
const char *
const source,
size_t source_len,
const char *
const dest,
size_t dest_len,
bool cutoff_dna) {
911 size_t wanted_ali_len = source_len*3;
914 if (ali_len<wanted_ali_len) {
915 fail_reason =
GBS_global_string(
"Alignment '%s' is too short (increase its length to %zu)", ali_dest, wanted_ali_len);
916 if (wanted_ali_len>needed_ali_len) needed_ali_len = wanted_ali_len;
920 size_t compressed_len;
921 char *compressed_dest =
unalign(dest, dest_len, compressed_len);
925 RealignAttempt attempt(allowed, compressed_dest, compressed_len, source, buffer, ali_len, cutoff_dna);
930 size_t min_dna, max_dna;
931 calc_needed_dna(source, source_len, min_dna, max_dna);
933 if (min_dna<compressed_len) {
934 size_t extra_dna = compressed_len-min_dna;
935 for (
size_t skip = 1; skip<=extra_dna; ++skip) {
936 RealignAttempt attemptSkipped(allowed, compressed_dest+skip, compressed_len-skip, source, buffer, ali_len, cutoff_dna);
937 if (!attemptSkipped.
failed()) {
940 size_t start_gaps = countLeadingGaps(buffer);
941 if (start_gaps<skip) {
943 skip), source, compressed_dest);
946 memcpy(buffer+(start_gaps-skip), compressed_dest, skip);
964 fail_reason = annotate_fail_position(failed, source, dest, compressed_dest);
967 free(compressed_dest);
969 ali_assert(contradicted(buffer, fail_reason));
1030 long no_of_realigned_species = 0;
1032 arb_progress progress(
"Re-aligner", no_of_marked_species);
1035 Realigner realigner(ali_source, ali_dest, ali_len);
1038 !error && gb_species;
1043 Data source(gb_species, ali_source);
1044 Data dest(gb_species, ali_dest);
1051 #if defined(ASSERTION_USED)
1052 bool has_valid_translation_info =
false;
1055 int arb_transl_table, codon_start;
1060 else if (arb_transl_table >= 0) {
1063 #if defined(ASSERTION_USED)
1064 has_valid_translation_info =
true;
1077 if (explicit_table_known >= 0) {
1078 const int codon_start = 0;
1081 #if defined(ASSERTION_USED)
1089 if (!error && unmark_succeeded)
GB_write_flag(gb_species, 0);
1099 no_of_realigned_species++;
1107 if (no_of_marked_species == 0) {
1108 GB_warning(
"Please mark some species to realign them");
1110 else if (no_of_realigned_species != no_of_marked_species) {
1111 long failed = no_of_marked_species-no_of_realigned_species;
1113 if (no_of_realigned_species) {
1114 GB_warningf(
"%li marked species failed to realign (%li succeeded)", failed, no_of_realigned_species);
1117 GB_warning(
"All marked species failed to realign");
1121 if (error) progress.
done();
1139 static void msg_to_string(
const char *msg) {
1144 static const char *translation_info(
GBDATA *gb_species) {
1145 int arb_transl_table;
1149 static SmartCharPtr
result;
1164 #define DNASEQ(name) GB_read_char_pntr(GBT_find_sequence(GBT_find_species(gb_main, name), "ali_dna"))
1165 #define PROSEQ(name) GB_read_char_pntr(GBT_find_sequence(GBT_find_species(gb_main, name), "ali_pro"))
1167 #define TRANSLATION_INFO(name) translation_info(GBT_find_species(gb_main, name))
1169 void TEST_realign() {
1177 enum TransResult { SAME, CHANGED };
1181 size_t neededLength = 0;
1184 struct transinfo_check {
1185 const char *species_name;
1186 const char *old_info;
1187 TransResult changed;
1188 const char *new_info;
1191 transinfo_check
info[] = {
1192 {
"BctFra12",
"t=0,cs=1", SAME,
NULp },
1193 {
"CytLyti6",
"t=9,cs=1", CHANGED,
"t=9,cs=0" },
1194 {
"TaxOcell",
"t=14,cs=1", CHANGED,
"t=14,cs=0" },
1195 {
"StrRamo3",
"t=0,cs=1", SAME,
NULp },
1196 {
"StrCoel9",
"t=0,cs=0", SAME,
NULp },
1197 {
"MucRacem",
"t=0,cs=1", CHANGED,
"t=0,cs=0" },
1198 {
"MucRace2",
"t=0,cs=1", CHANGED,
"t=0,cs=0" },
1199 {
"MucRace3",
"t=0,cs=0", SAME,
NULp },
1200 {
"AbdGlauc",
"t=0,cs=0", SAME,
NULp },
1201 {
"CddAlbic",
"t=0,cs=0", SAME,
NULp },
1209 for (
int i = 0; info[i].species_name; ++i) {
1210 const transinfo_check&
I = info[i];
1211 TEST_ANNOTATE(I.species_name);
1215 TEST_ANNOTATE(
NULp);
1218 error =
ALI_realign_marked(gb_main,
"ali_pro",
"ali_dna", neededLength,
false,
false);
1221 "Automatic re-align failed for 'BctFra12'\nReason: not enough nucs for X's at sequence end at ali_pro:40 / ali_dna:109\n"
1222 "Automatic re-align failed for 'StrRamo3'\nReason: not enough nucs for X's at sequence end at ali_pro:36 / ali_dna:106\n"
1223 "Automatic re-align failed for 'MucRace3'\nReason: Sync behind 'X' failed foremost with: Not all IUPAC-combinations of 'NCC' translate to 'T' (for trans-table 1) at ali_pro:28 / ali_dna:78\n"
1224 "3 marked species failed to realign (7 succeeded)\n"
1230 TEST_EXPECT_EQUAL(DNASEQ(
"BctFra12"),
"ATGGCTAAAGAGAAATTTGAACGTACCAAACCGCACGTAAACATTGGTACAATCGGTCACGTTGACCACGGTAAAACCACTTTGACTGCTGCTATCACTACTGTGTTG------------------");
1231 TEST_EXPECT_EQUAL(DNASEQ(
"CytLyti6"),
"-A-TGGCAAAGGAAACTTTTGATCGTTCCAAACCGCACTTAA---ATATAG---GTACTATTGGACACGTAGATCACGGTAAAACTACTTTAACTGCTGCTATTACAASAGTAT-T-----G....");
1232 TEST_EXPECT_EQUAL(DNASEQ(
"TaxOcell"),
"AT-GGCTAAAGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT------CGGTCACGTGGACCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCT-G..........");
1233 TEST_EXPECT_EQUAL(DNASEQ(
"StrRamo3"),
"ATGTCCAAGACGGCATACGTGCGCACCAAACCGCATCTGAACATCGGCACGATGGGTCATGTCGACCACGGCAAGACCACGTTGACCGCCGCCATCACCAAGGTCCTC------------------");
1234 TEST_EXPECT_EQUAL(DNASEQ(
"StrCoel9"),
"ATGTCCAAGACGGCGTACGTCCGC-C--C--A-CC-TG--A----GGCACGATG-G-CC--C-GACCACGGCAAGACCACCCTGACCGCCGCCATCACCAAGGTC-C--T--------C.......");
1235 TEST_EXPECT_EQUAL(DNASEQ(
"MucRacem"),
"......ATGGGTAAAGAG---------AAGACTCACGTTAACGTCGTCGTCATTGGTCACGTCGATTCCGGTAAATCTACTACTACTGGTCACTTGATTTACAAGTGTGGTGGTATA-AA......");
1236 TEST_EXPECT_EQUAL(DNASEQ(
"MucRace2"),
"ATGGGTAAGGAG---------AAGACTCACGTTAACGTCGTCGTCATTGGTCACGTCGATTCCGGTAAATCTACTACTACTGGTCACTTGATTTACAAGTGTGGTGGT-ATNNNAT-AAA......");
1237 TEST_EXPECT_EQUAL(DNASEQ(
"MucRace3"),
"-----------ATGGGTAAAGAGAAGACTCACGTTRAYGTTGTCGTTATTGGTCACGTCRATTCCGGTAAGTCCACCNCCRCTGGTCACTTGATTTACAAGTGTGGTGGTATAA-A----------");
1238 TEST_EXPECT_EQUAL(DNASEQ(
"AbdGlauc"),
"ATGGGTAAA-G--A--A--A--A--G-AC--T-CACGTTAACGTCGTTGTCATTGGTCACGTCGATTCTGGTAAATCCACCACCACTGGTCATTTGATCTACAAGTGCGGTGGTATA-AA......");
1239 TEST_EXPECT_EQUAL(DNASEQ(
"CddAlbic"),
"ATG-GG-TAAA-GAA------------AAAACTCACGTTAACGTTGTTGTTATTGGTCACGTCGATTCCGGTAAATCTACTACCACCGGTCACTTAATTTACAAGTGTGGTGGTATA-AA......");
1242 for (
int i = 0; info[i].species_name; ++i) {
1243 const transinfo_check&
I = info[i];
1244 TEST_ANNOTATE(I.species_name);
1245 switch (I.changed) {
1256 TEST_ANNOTATE(
NULp);
1264 struct translate_check {
1265 const char *species_name;
1266 const char *original_prot;
1267 TransResult retranslation;
1268 const char *changed_prot;
1271 translate_check trans[] = {
1272 {
"CytLyti6",
"XWQRKLLIVPNRT*-I*-VLLDT*ITVKLL*SSLLZZYX-X.",
1273 CHANGED,
"XWQRKLLIVPNRT*-I*-VLLDT*ITVKLL*SSLLQZYX-X." },
1274 {
"TaxOcell",
"XG*SNFWPVQAARNHRHD--RSRGPRQBDSDRCYHHGAX-..",
1275 CHANGED,
"XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX..." },
1276 {
"MucRacem",
"..MGKE---KTHVNVVVIGHVDSGKSTTTGHLIYKCGGIX..", SAME,
NULp },
1277 {
"MucRace2",
"MGKE---KTHVNVVVIGHVDSGKSTTTGHLIYKCGGXXXK--",
1278 CHANGED,
"MGKE---KTHVNVVVIGHVDSGKSTTTGHLIYKCGGXXXK.." },
1279 {
"AbdGlauc",
"MGKXXXXXXXXHVNVVVIGHVDSGKSTTTGHLIYKCGGIX..", SAME,
NULp },
1280 {
"StrCoel9",
"MSKTAYVRXXXXXX-GTMXXXDHGKTTLTAAITKVXX--X..", SAME,
NULp },
1281 {
"CddAlbic",
"MXXXE----KTHVNVVVIGHVDSGKSTTTGHLIYKCGGIX..", SAME,
NULp },
1287 for (
int t = 0; trans[t].species_name; ++t) {
1288 const translate_check&
T = trans[t];
1289 TEST_ANNOTATE(T.species_name);
1292 TEST_ANNOTATE(
NULp);
1297 TEST_EXPECT_EQUAL(msgs,
"codon_start and transl_table entries were found for all translated taxa\n10 taxa converted\n 1.100000 stops per sequence found\n");
1300 for (
int t = 0; trans[t].species_name; ++t) {
1301 const translate_check& T = trans[t];
1302 TEST_ANNOTATE(T.species_name);
1303 switch (T.retranslation) {
1315 TEST_ANNOTATE(
NULp);
1340 error =
ALI_realign_marked(gb_main,
"ali_dna",
"ali_pro", neededLength,
false,
false);
1347 GBDATA *gb_TaxOcell_amino;
1360 struct realign_check {
1364 TransResult retranslation;
1365 const char *changed_prot;
1368 realign_check
seq[] = {
1371 {
"XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX-..",
"AT-GGCTAAAGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT------CGGTCACGTGGACCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCT-G..........",
false, CHANGED,
1372 "XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX..." },
1374 {
"XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHG.....",
"AT-GGCTAAAGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT------CGGTCACGTGGACCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCTG...........",
false, CHANGED,
1375 "XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX..." },
1376 {
"XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHG.....",
"AT-GGCTAAAGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT------CGGTCACGTGGACCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGT...............",
true, SAME,
NULp },
1378 {
"XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYH-----..",
"AT-GGCTAAAGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT------CGGTCACGTGGACCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCTG...........",
false, CHANGED,
1379 "XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX..." },
1380 {
"XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCY---H....",
"AT-GGCTAAAGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT------CGGTCACGTGGACCACGGCAAAACGACTCTGACCGCTGCTAT---------CACCACGGTGCTG..",
false, CHANGED,
1381 "XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCY---HHGAX" },
1383 {
"---SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX-..",
"-ATGGCTAAAGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT------CGGTCACGTGGACCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCT-G..........",
false, CHANGED,
1384 "XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX..." },
1385 {
"...SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX...",
".........AGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT------CGGTCACGTGGACCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCT-G..........",
true, SAME,
NULp },
1388 {
"XG*SNFXXXXXXAXXXNHRHDXXXXXXPRQNDSDRCYHHGAX",
"AT-GGCTAAAGAAACTTT-TG-AC-CG-GT-CCAA-GCC-GC-ACGT-AAACATCGGCACGAT-CG-GT-CA-CG-TGGA-CCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCT-G.",
false, SAME,
NULp },
1389 {
"XG*SNFWPVQAARNHRHD-XXXXXX-PRQNDSDRCYHHGAX-",
"AT-GGCTAAAGAAACTTTTGACCGGTCCAAGCCGCACGTAAACATCGGCACGAT---CG-GT-CA-CG-TG-GA----CCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCT-G....",
false, CHANGED,
1390 "XG*SNFWPVQAARNHRHD-XXXXXX-PRQNDSDRCYHHGAX." },
1391 {
"XG*SNXLXRXQA-ARNHRHD-RXXVX-PRQNDSDRCYHHGAX",
"AT-GGCTAAAGAAACTT-TTGAC-CGGTC-CAAGCC---GCACGTAAACATCGGCACGAT---CGG-TCAC-GTG-GA---CCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCT-G.",
false, SAME,
NULp },
1392 {
"XG*SXXFXDXVQAXT*TSARXRSXVX-PRQNDSDRCYHHGAX",
"AT-GGCTAAAGA-A-AC-TTT-T-GACCG-GTCCAAGCCGC-ACGTAAACATCGGCACGA-T-CGGTCA-C-GTG-GA---CCACGGCAAAACGACTCTGACCGCTGCTATCACCACGGTGCT-G.",
false, SAME,
NULp },
1398 int arb_transl_table, codon_start;
1407 for (
int s = 0; seq[
s].seq; ++
s) {
1409 realign_check& S = seq[
s];
1416 error =
ALI_realign_marked(gb_main,
"ali_pro",
"ali_dna", neededLength,
false, S.cutoff);
1428 TEST_EXPECT_EQUAL(msgs,
"codon_start and transl_table entries were found for all translated taxa\n1 taxa converted\n 2.000000 stops per sequence found\n");
1431 TEST_EXPECT_EQUAL(msgs,
"codon_start and transl_table entries were found for all translated taxa\n1 taxa converted\n 0.000000 stops per sequence found\n");
1434 TEST_EXPECT_EQUAL(msgs,
"codon_start and transl_table entries were found for all translated taxa\n1 taxa converted\n 1.000000 stops per sequence found\n");
1437 switch (S.retranslation) {
1452 TEST_ANNOTATE(
NULp);
1462 struct realign_fail {
1464 const char *failure;
1467 #define ERRPREFIX "Automatic re-align failed for 'TaxOcell'\nReason: "
1468 #define ERRPREFIX_LEN 49
1470 #define FAILONE "All marked species failed to realign\n"
1475 realign_fail seq[] = {
1480 {
"XG*SNFXXXXXAXNHRHD--XXX-PRQNDSDRCYHHGAX-..",
"Sync behind 'X' failed foremost with: 'GGA' translates to 'G', not to 'P' at ali_pro:25 / ali_dna:70\n" FAILONE },
1481 {
"XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX-..XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX-..",
"Alignment 'ali_dna' is too short (increase its length to 252)\n" FAILONE },
1482 {
"XG*SNFWPVQAARNHRHD--XXX-PRQNDSDRCYHHGAX-..",
"Sync behind 'X' failed foremost with: 'GGA' translates to 'G', not to 'P' at ali_pro:25 / ali_dna:70\n" FAILONE },
1483 {
"XG*SNX-A-X-ARNHRHD--XXX-PRQNDSDRCYHHGAX-..",
"Sync behind 'X' failed foremost with: 'TGA' never translates to 'A' at ali_pro:8 / ali_dna:19\n" FAILONE },
1484 {
"XG*SXFXPXQAXRNHRHD--RSRGPRQNDSDRCYHHGAX-..",
"Sync behind 'X' failed foremost with: 'ACG' translates to 'T', not to 'R' at ali_pro:13 / ali_dna:36\n" FAILONE },
1485 {
"XG*SNFWPVQAARNHRHD-----GPRQNDSDRCYHHGAX-..",
"Sync behind 'X' failed foremost with: 'CGG' translates to 'R', not to 'G' at ali_pro:24 / ali_dna:61\n" FAILONE },
1486 {
"XG*SNFWPVQAARNHRHDRSRGPRQNDSDRCYHHGAXHHGA.",
"Sync behind 'X' failed foremost with: not enough nucs left for codon of 'H' at ali_pro:38 / ali_dna:117\n" FAILONE },
1487 {
"XG*SNFWPVQAARNHRHD--RSRGPRQNDSDRCY----H...",
"Sync behind 'X' failed foremost with: too much trailing DNA (10 nucs, but only 9 columns left) at ali_pro:43 / ali_dna:106\n" FAILONE },
1488 {
"--SNFWPVQAARNHRHD--RSRGPRQNDSDRCYHHGAX--..",
"Not enough gaps to place 8 extra nucs at start of sequence at ali_pro:1 / ali_dna:1\n" FAILONE },
1500 for (
int s = 0; seq[
s].seq; ++
s) {
1507 error =
ALI_realign_marked(gb_main,
"ali_pro",
"ali_dna", neededLength,
false,
false);
1517 TEST_ANNOTATE(
NULp);
1526 struct explicit_realign {
1552 const char*
const NO_TI =
"t=-1,cs=-1";
1554 explicit_realign example[] = {
1560 {
"LK",
"TTGAAG", -1, NO_TI,
NULp },
1562 {
"G",
"RGG", -1,
"t=10,cs=0",
NULp },
1565 {
"LK",
"YTRAAR", 2,
"t=2,cs=0",
"Not all IUPAC-combinations of 'YTR' translate to 'L' (for trans-table 3) at ali_pro:1 / ali_dna:1\n" },
1566 {
"LX",
"YTRAAR", -1, NO_TI,
NULp },
1567 {
"LXX",
"YTRAARATH", -1,
"t=14,cs=0",
NULp },
1568 {
"LXI",
"YTRAARATH", -1, NO_TI,
NULp },
1570 {
"LX",
"YTRAAR", 2,
"t=2,cs=0",
"Not all IUPAC-combinations of 'YTR' translate to 'L' (for trans-table 3) at ali_pro:1 / ali_dna:1\n" },
1571 {
"LK",
"YTRAAR", -1, NO_TI,
NULp },
1572 {
"LK",
"YTRAAR", 6,
"t=6,cs=0",
"Not all IUPAC-combinations of 'AAR' translate to 'K' (for trans-table 9) at ali_pro:2 / ali_dna:4\n" },
1573 {
"XK",
"YTRAAR", -1, NO_TI,
NULp },
1575 {
"XX",
"-YTRAAR", 0,
"t=0,cs=0",
NULp },
1576 {
"XXL",
"YTRAARTTG", 0,
"t=0,cs=0",
"Not enough gaps to place 2 extra nucs at start of sequence at ali_pro:1 / ali_dna:1\n" },
1577 {
"-XXL",
"-YTRA-AR-TTG", 0,
"t=0,cs=0",
NULp },
1578 {
"IXXL",
"ATTYTRAARTTG", 0,
"t=0,cs=0",
"Sync behind 'X' failed foremost with: 'RTT' never translates to 'L' (for trans-table 1) at ali_pro:4 / ali_dna:9\n" },
1579 {
"XX",
"-YTRAAR", -1, NO_TI,
NULp },
1580 {
"IXXL",
"ATTYTRAARTTG", -1, NO_TI,
"Sync behind 'X' failed foremost with: 'RTT' never translates to 'L' at ali_pro:4 / ali_dna:9\n" },
1582 {
"LX",
"YTRATH", -1, NO_TI,
NULp },
1583 {
"LX",
"YTRATH", 2,
"t=2,cs=0",
"Not all IUPAC-combinations of 'YTR' translate to 'L' (for trans-table 3) at ali_pro:1 / ali_dna:1\n" },
1584 {
"XX",
"YTRATH", 2,
"t=2,cs=0",
NULp },
1585 {
"XX",
"YTRATH", -1,
"t=2,cs=0",
NULp },
1589 {
"XX",
"AARATH", 14,
"t=14,cs=0",
NULp },
1590 {
"XX",
"AARATH", -1,
"t=14,cs=0",
NULp },
1591 {
"KI",
"AARATH", -1, NO_TI,
NULp },
1592 {
"KI",
"AARATH", 4,
"t=4,cs=0",
"Not all IUPAC-combinations of 'ATH' translate to 'I' (for trans-table 5) at ali_pro:2 / ali_dna:4\n" },
1593 {
"BX",
"AAWATH", -1,
"t=14,cs=0",
NULp },
1594 {
"RX",
"AGRATH", -1,
"t=2,cs=0",
NULp },
1595 {
"MX",
"TTGATH", -1,
"t=10,cs=0",
NULp },
1597 {
"XI",
"AARATH", 14,
"t=14,cs=0",
"Sync behind 'X' failed foremost with: Not all IUPAC-combinations of 'ATH' translate to 'I' (for trans-table 21) at ali_pro:2 / ali_dna:4\n" },
1598 {
"KI",
"AARATH", 14,
"t=14,cs=0",
"Not all IUPAC-combinations of 'AAR' translate to 'K' (for trans-table 21) at ali_pro:1 / ali_dna:1\n" },
1604 {
"W*",
"TGATGA", 20,
"t=20,cs=0",
NULp },
1607 {
"E*",
"TAGTAG", 24,
"t=24,cs=0",
NULp },
1608 {
"E*",
"TAATAA", 24,
"t=24,cs=0",
NULp },
1609 {
"E*",
"TARTAR", 24,
"t=24,cs=0",
NULp },
1612 {
"W*",
"TGATGA", 21,
"t=21,cs=0",
NULp },
1613 {
"W*",
"TGGTGG", 21,
"t=21,cs=0",
"'TGG' translates to 'W', not to '*' at ali_pro:2 / ali_dna:4\n" },
1614 {
"W*",
"TGRTGR", 21,
"t=21,cs=0",
"Not all IUPAC-combinations of 'TGR' translate to '*' (for trans-table 28) at ali_pro:2 / ali_dna:4\n" },
1623 {
"MI",
"ATTATT", 8,
"t=8,cs=0",
NULp },
1624 {
"MI",
"ATCATC", 8,
"t=8,cs=0",
NULp },
1625 {
"MI",
"ATAATA", 8,
"t=8,cs=0",
NULp },
1626 {
"MI",
"ATGATG", 8,
"t=8,cs=0",
"'ATG' translates to 'M', not to 'I' at ali_pro:2 / ali_dna:4\n" },
1627 {
"MM",
"ATGATG", 8,
"t=8,cs=0",
NULp },
1628 {
"MI",
"ATWATW", 8,
"t=8,cs=0",
NULp },
1629 {
"MI",
"ATHATH", 8,
"t=8,cs=0",
NULp },
1630 {
"MI",
"ATBATB", 8,
"t=8,cs=0",
"Not all IUPAC-combinations of 'ATB' translate to 'I' (for trans-table 11) at ali_pro:2 / ali_dna:4\n" },
1633 {
"LL",
"TTATTA", -1,
"t=-1,cs=-1",
NULp },
1634 {
"ML",
"TTATTA", -1,
"t=3,cs=0",
NULp },
1635 {
"**",
"TTATTA", -1,
"t=16,cs=0",
NULp },
1636 {
"*L",
"TTATTA", -1,
"t=-1,cs=-1",
"'TTA' does not translate to 'L' (for trans-table 23) at ali_pro:2 / ali_dna:4\n" },
1637 {
"*M",
"TTATTA", -1,
"t=-1,cs=-1",
"'TTA' does not translate to 'M' (for trans-table 23) at ali_pro:2 / ali_dna:4\n" },
1638 {
"M*",
"TTATTA", -1,
"t=-1,cs=-1",
"'TTA' does not translate to '*' (for trans-table 4) at ali_pro:2 / ali_dna:4\n" },
1643 for (
int e = 0; example[e].acids; ++e) {
1644 const explicit_realign& E = example[e];
1651 if (E.table == -1) {
1660 error =
ALI_realign_marked(gb_main,
"ali_pro",
"ali_dna", neededLength,
false,
false);
1664 string wanted_msgs =
string(E.msgs)+FAILONE;
1674 size_t expextedLen = strlen(E.dna);
1675 size_t seqlen = strlen(dnaseq);
1676 char *firstPart =
ARB_strndup(dnaseq, expextedLen);
1678 char *nothing =
unalign(dnaseq+expextedLen, seqlen-expextedLen, dna_behind);
1704 error =
ALI_realign_marked(gb_main,
"ali_pro",
"ali_dna", neededLength,
false,
false);
1706 TEST_EXPECT_EQUAL(msgs, ERRPREFIX
"Error while reading 'transl_table' (Illegal (or unsupported) value (666) in 'transl_table' (item='TaxOcell'))\n" FAILONE);
1711 for (
int i = 0; i<2; ++i) {
1723 error =
ALI_realign_marked(gb_main,
"ali_pro",
"ali_dna", neededLength,
false,
false);
1732 TEST_ANNOTATE(
NULp);
1738 #undef ERRPREFIX_LEN
1744 static const char *permOf(
const Distributor& dist) {
1746 static char buffer[MAXDIST+1];
1749 for (
int p = 0; p<dist.
size(); ++p) {
1750 buffer[p] =
'0'+dist[p];
1752 buffer[dist.
size()] = 0;
1763 return all().ofgroup(expected);
1766 void TEST_distributor() {
1827 #endif // UNIT_TESTS
GBDATA * GB_open(const char *path, const char *opent)
void GB_warning(const char *message)
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
const char * protein_at() const
#define implicated(hypothesis, conclusion)
return string(buffer, length)
const char * failure() const
GB_ERROR GB_append_exportedError(GB_ERROR error)
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
long GBT_mark_all(GBDATA *gb_main, int flag)
GB_ERROR translate_removeInfo(GBDATA *gb_species)
void put(T c, size_t count)
GBDATA * GBT_get_alignment(GBDATA *gb_main, const char *aliname)
GB_ERROR ALI_translate_marked(GBDATA *gb_main, bool use_entries, bool save_entries, int selected_startpos, bool translate_all, const char *ali_source, const char *ali_dest)
RealignAttempt(const TransTables &allowed_, const char *compressed_dna_, size_t compressed_len_, const char *aligned_protein_, char *target_dna_, size_t target_len_, bool cutoff_dna_)
SizedBufferPtr< char > SizedWriteBuffer
char * ARB_strdup(const char *str)
bool operator>(const FailedAt &other) const
const char * GBS_global_string(const char *templat,...)
void forbidAllBut(int nr)
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
DECLARE_ASSIGNMENT_OPERATOR(Distributor)
char * realign_seq(TransTables &allowed, const char *const source, size_t source_len, const char *const dest, size_t dest_len, bool cutoff_dna)
void auto_subtitles(const char *prefix)
static GB_ERROR distribute_xdata(SizedReadBuffer &dna, size_t xcount, char *xtarget_, bool gap_before, bool gap_after, const TransTables &allowed, TransTables &remaining)
Realigner(const char *ali_source_, const char *ali_dest_, size_t ali_len_)
void add_prefix(const char *prefix)
char buffer[MESSAGE_BUFFERSIZE]
GBDATA * GB_get_father(GBDATA *gbd)
GB_ERROR GB_delete(GBDATA *&source)
GB_ERROR GBT_add_alignment_changekeys(GBDATA *gb_main, const char *ali)
GB_ERROR get_error() const
NOT4PERL GBDATA * GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type) __ATTR__DEPRECATED_TODO("better use GBT_create_sequence_data()")
#define TEST_EXPECT_CONTAINS(str, part)
size_t restLength() const
BufferPtr< T > & operator++()
size_t GB_read_string_count(GBDATA *gbd)
GB_ERROR GB_await_error()
NOT4PERL long * GBT_read_int(GBDATA *gb_container, const char *fieldpath)
void GB_warningf(const char *templat,...)
TYPE * ARB_alloc(size_t nelem)
bool mayFailTranslation() const
arb_handlers * active_arb_handlers
const TransTables & get_remaining_tables() const
int translate_nuc2aa(int arb_code_nr, char *data, size_t size, size_t pos, bool translate_all, bool create_start_codon, bool append_stop_codon, int *translatedSize)
GB_ERROR translate_saveInfo(GBDATA *gb_species, int arb_transl_table, int codon_start)
void set_failure(const char *reason)
int TTIT_arb2embl(int arb_code_nr)
#define TEST_REJECT_NULL(n)
Distributor(int xcount_, int dnacount)
static void error(const char *msg)
size_t get_needed_dest_alilen() const
Data(GBDATA *gb_species, const char *aliName)
expectation_group & add(const expectation &e)
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
ASSERTING_CONSTEXPR_INLINE int info2bio(int infopos)
GB_alignment_type GBT_get_alignment_type(GBDATA *gb_main, const char *aliname)
GB_ERROR translate_getInfo(GBDATA *gb_item, int &arb_transl_table, int &codon_start)
FailedAt(GB_ERROR reason_, const char *at_prot_, const char *at_dna_)
SizedBufferPtr(T *b, size_t len_)
bool legal_ORF_pos(int p)
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
long GBT_count_marked_species(GBDATA *gb_main)
T operator[](int i) const
GB_ERROR GB_print_error()
#define TEST_EXPECTATION(EXPCTN)
GB_ERROR ALI_realign_marked(GBDATA *gb_main, const char *ali_source, const char *ali_dest, size_t &neededLength, bool unmark_succeeded, bool cutoff_dna)
bool aw_ask_sure(const char *unique_id, const char *msg)
bool translates_to_Xs(const char *dna, TransTables allowed, TransTables &remaining) const
#define TEST_EXPECT_NULL(n)
GB_ERROR close(GB_ERROR error)
void GB_write_flag(GBDATA *gbd, long flag)
BufferPtr< T > & operator--()
char * ARB_strndup(const char *start, int len)
char * GB_read_string(GBDATA *gbd)
#define AWAR_PROTEIN_TYPE
void put(T c1, T c2, T c3)
int explicit_table() const
const char * dna_at() const
void ARB_install_handlers(arb_handlers &handlers)
#define TEST_EXPECT_NO_ERROR(call)
GBDATA * GBT_create_alignment(GBDATA *gb_main, const char *name, long len, long aligned, long security, const char *type, const char *why_created)
bool is_std_gap(const char c)
#define AUTODETECT_STARTPOS
const FailedAt & failed() const
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
#define TEST_EXPECT_ERROR_CONTAINS(call, part)
#define TEST_EXPECT_DIFFERENT(expr, want)
void copy(BufferPtr< const T > &source, size_t count)
SizedBufferPtr< const char > SizedReadBuffer
int GB_get_transaction_level(GBDATA *gbd)
Distributor(const Distributor &other)
arb_status_implementation status
int operator[](int off) const
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
bool AWT_is_codon(char protein, const char *const dna, const TransTables &allowed, TransTables &remaining, const char **fail_reason_ptr)
GB_ERROR GBT_check_data(GBDATA *Main, const char *alignment_name)
void AP_initialize_codon_tables()
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
static int info[maxsites+1]
#define TEST_EXPECT_EQUAL(expr, want)
GBDATA * GB_entry(GBDATA *father, const char *key)
void inc_and_check_user_abort(GB_ERROR &error)
char * GBS_global_string_copy(const char *templat,...)
void GB_close(GBDATA *gbd)
char * unalign(const char *data, size_t len, size_t &compressed_len)
bool is_subset_of(const TransTables &other) const
GB_write_int const char s