21 #define MASTER_GAP_OPEN 50
22 #define CHEAP_GAP_OPEN 20 // penalty for creating a gap (if we have gaps in master)
24 #define DEFAULT_GAP_OPEN 30 // penalty for creating a gap
26 #define MASTER_GAP_EXTEND 18
27 #define CHEAP_GAP_EXTEND 5 // penalty for extending a gap (if we have gaps in master)
29 #define DEFAULT_GAP_EXTEND 10 // penalty for extending a gap
31 #define DEFAULT_IMPROBABLY_MUTATION 10 // penalty for mutations
32 #define DEFAULT_PROBABLY_MUTATION 4 // penalty for special mutations (A<->G,C<->U/T) (only if 'weighted')
34 #define DYNAMIC_PENALTIES // otherwise you get FIXED PENALTIES (=no cheap penalties)
38 #define MAX_GAP_OPEN_DISCOUNT (DEFAULT_GAP_OPEN-CHEAP_GAP_OPEN) // maximum subtracted from DEFAULT_GAP_OPEN
39 #define MAX_GAP_EXTEND_DISCOUNT (DEFAULT_GAP_EXTEND-CHEAP_GAP_EXTEND) // maximum subtracted from DEFAULT_GAP_EXTEND
41 #define MAXN 2 // Maximum number of sequences (both groups)
50 #define MAX_BASETYPES 21
73 #if defined(ASSERTION_USED)
75 #endif // ASSERTION_USED
93 # define IF_MATRIX_DUMP(xxx) xxx
94 # define DISPLAY_MATRIX_SIZE 3000
96 static int vertical [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2];
97 static int verticalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2];
98 static int diagonal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2];
99 static int horizontal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2];
100 static int horizontalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2];
103 # define IF_MATRIX_DUMP(xxx)
107 #ifdef DYNAMIC_PENALTIES
108 long gaps = gaps_before_position[beforePosition-1];
120 #ifdef DYNAMIC_PENALTIES
121 long gaps = gaps_before_position[beforePosition-1];
133 if (length<=0)
return 0;
135 int beforePosition = atPosition;
136 int afterPosition = atPosition-1;
142 if (p1<p2 && beforePosition>1) {
175 #define UNKNOWN_ACID 255
193 #define cheap_if(cond) ((cond) ? 1 : 2)
194 static int baseCmp(
unsigned char c1,
unsigned char c2) {
199 #define COMPARABLE_BASES 5
201 if (c1==c2)
return 0;
216 case 2:
return cheap_if(c2==4 || c2==5);
218 case 4:
if (c2==5)
return 0;
220 case 5:
if (c2==4)
return 0;
229 if (isalpha(nucleic_maybe[i][c2])) {
231 if (match<bestMatch) bestMatch = match;
237 if (isalpha(nucleic_maybe[i][c1])) {
239 if (match<bestMatch) bestMatch = match;
256 const char *p1 = strchr(nucleic_acid_order, c1);
257 const char *p2 = strchr(nucleic_acid_order, c2);
262 if (p1 && p2)
return baseCmp(p1-nucleic_acid_order, p2-nucleic_acid_order);
286 -4, 1, 0, -1, 0, 0, 2,
287 -5, 0, 0, -1, 0, 1, 2, 4,
288 -5, 0, 0, -1, 0, 0, 1, 3, 4,
289 -5, -1, -1, 0, 0, -1, 1, 2, 2, 4,
290 -3, -1, -1, 0, -1, -2, 2, 1, 1, 3, 6,
291 -4, 0, -1, 0, -2, -3, 0, -1, -1, 1, 2, 6,
292 -5, 0, 0, -1, -1, -2, 1, 0, 0, 1, 0, 3, 5,
293 -5, -2, -1, -2, -1, -3, -2, -3, -2, -1, -2, 0, 0, 6,
294 -2, -1, 0, -2, -1, -3, -2, -2, -2, -2, -2, -2, -2, 2, 5,
295 -6, -3, -2, -3, -2, -4, -3, -4, -3, -2, -2, -3, -3, 4, 2, 6,
296 -2, -1, 0, -1, 0, -1, -2, -2, -2, -2, -2, -2, -2, 2, 4, 2, 4,
297 -4, -3, -3, -5, -4, -5, -4, -6, -5, -5, -2, -4, -5, 0, 1, 2, -1, 9,
298 0, -3, -3, -5, -3, -5, -2, -4, -4, -4, 0, -4, -4, -2, -1, -1, -2, 7, 10,
299 -8, -2, -5, -6, -6, -7, -4, -7, -7, -5, -3, 2, -3, -4, -5, -2, -6, 0, 0, 17
304 #if (defined(DISPLAY_DIFF) || defined(MATRIX_DUMP))
305 static void p_decode(
const unsigned char *naseq,
unsigned char *
seq,
int l) {
306 int len = strlen(amino_acid_order);
308 for (
int i=1; i<=l && naseq[i]; i++) {
310 seq[i] = amino_acid_order[naseq[i]];
314 static void n_decode(
const unsigned char *naseq,
unsigned char *
seq,
int l) {
315 int len = strlen(nucleic_acid_order);
317 for (
int i=1; i<=l && naseq[i]; i++) {
319 seq[i] = nucleic_acid_order[naseq[i]];
325 return "ClustalV-aligner: MAXLEN is dimensioned to small for this sequence";
329 if (error)
return NULp;
331 void *ret = malloc(bytes);
333 if (!ret) error =
"out of memory";
345 naa1[i] = (
int *)
ckalloc((max_seq_length+1)*
sizeof(
int), error);
346 naa2[i] = (
int *)
ckalloc((max_seq_length+1)*
sizeof(
int), error);
347 naas[i] = (
int *)
ckalloc((max_seq_length+1)*
sizeof(
int), error);
353 little_pam=big_pam=matptr[0];
354 for (
int i=0; i<210; ++i) {
356 little_pam=(little_pam<c) ? little_pam : c;
357 big_pam=(big_pam>c) ? big_pam : c;
359 for (
int i=0; i<210; ++i) {
364 xover = big_pam - nv;
374 for (
int i=0; i<20; ++i) {
375 for (
int j=0; j<=i; ++j) {
376 pam[i][j]=
pamo[pos++];
380 for (
int i=0; i<20; ++i) {
381 for (
int j=0; j<=i; ++j) {
398 weights[i][j] = big_pam - pam[i-1][j-1];
402 weights[0][i] =
xover;
403 weights[i][0] =
xover;
422 #if defined(ASSERTION_USED)
423 displ_size = (2*max_seq_length + 1);
424 #endif // ASSERTION_USED
426 displ = (
int *)
ckalloc((2*max_seq_length + 1) *
sizeof (
int), error);
429 zza = (
int *)
ckalloc((max_seq_length+1) *
sizeof (
int), error);
430 zzb = (
int *)
ckalloc((max_seq_length+1) *
sizeof (
int), error);
432 zzc = (
int *)
ckalloc((max_seq_length+1) *
sizeof (
int), error);
433 zzd = (
int *)
ckalloc((max_seq_length+1) *
sizeof (
int), error);
444 #if defined(ASSERTION_USED)
446 #endif // ASSERTION_USED
450 fa_assert(offset >= 0 && offset<
int(displ_size));
455 fa_assert(offset >= 0 && offset<
int(displ_size));
463 if (last_print<0 && print_ptr>0) {
474 unsigned char j =
seq_array[alist[1]][v2+jat-1];
475 return j<128 ? naas[j][v1+iat-1] : 0;
480 inline const unsigned char *lstr(
const unsigned char *
s,
int len) {
481 static unsigned char *lstr_ss = 0;
483 freeset(lstr_ss, (
unsigned char*)
strndup((
const char*)s, len));
487 inline const unsigned char *nstr(
unsigned char *cp,
int length) {
488 const unsigned char *s = lstr(cp, length);
489 (dnaflag ? n_decode : p_decode)(s-1, const_cast<unsigned char *>(s-1),
length);
493 inline void dumpMatrix(
int x0,
int y0,
int breite,
int hoehe,
int mitte_vert) {
494 char *sl = ARB_alloc<char>(hoehe+3);
495 char *ma = ARB_alloc<char>(breite+3);
497 sprintf(ma,
"-%s-", nstr(
seq_array[1]+x0, breite));
498 sprintf(sl,
"-%s-", nstr(
seq_array[2]+y0, hoehe));
504 for (b=0; b<=mitte_vert; b++) {
505 printf(
"%5c", ma[b]);
508 for (b++; b<=breite+1+1; b++) {
509 printf(
"%5c", ma[b-1]);
513 for (
int h=0; h<=hoehe+1; h++) {
514 printf(
"\n%c vertical: ", sl[h]);
515 for (
int b=0; b<=breite+1+1; b++) printf(
"%5i", vertical[b][h]);
516 printf(
"\n verticalOpen: ");
517 for (
int b=0; b<=breite+1+1; b++) printf(
"%5i", verticalOpen[b][h]);
518 printf(
"\n diagonal: ");
519 for (
int b=0; b<=breite+1+1; b++) printf(
"%5i", diagonal[b][h]);
520 printf(
"\n horizontal: ");
521 for (
int b=0; b<=breite+1+1; b++) printf(
"%5i", horizontal[b][h]);
522 printf(
"\n horizontalOpen:");
523 for (
int b=0; b<=breite+1+1; b++) printf(
"%5i", horizontalOpen[b][h]);
527 printf(
"--------------------\n");
534 static int diff(
int v1,
int v2,
int v3,
int v4,
int st,
int en) {
543 int display_matrix = 0;
548 # define DISPLAY_MATRIX_ELEMENTS ((DISPLAY_MATRIX_SIZE+2)*(DISPLAY_MATRIX_SIZE+2))
550 memset(vertical, -1, DISPLAY_MATRIX_ELEMENTS*
sizeof(
int));
551 memset(verticalOpen, -1, DISPLAY_MATRIX_ELEMENTS*
sizeof(
int));
552 memset(diagonal, -1, DISPLAY_MATRIX_ELEMENTS*
sizeof(
int));
553 memset(horizontal, -1, DISPLAY_MATRIX_ELEMENTS*
sizeof(
int));
554 memset(horizontalOpen, -1, DISPLAY_MATRIX_ELEMENTS*
sizeof(
int));
561 #if (defined (DEBUG) && 0)
565 (dnaflag ? n_decode : p_decode)(d-1, d-1, v3);
567 for (
int cnt=0; cnt<deep; cnt++) putchar(
' ');
568 printf(
"master = '%s' (penalties left=%i right=%i)\n", d, st, en);
571 (dnaflag ? n_decode : p_decode)(d-1, d-1, v4);
573 for (cnt=0; cnt<deep; cnt++) putchar(
' ');
574 printf(
"slave = '%s'\n", d);
580 if (last_print<0 && print_ptr>0) {
584 last_print =
set_displ(print_ptr++, -(v3));
611 for (
int j=1; j<=v4; ++j) {
613 if (k<ctrc) { ctrc = k; ctrj = j; }
618 if (last_print<0 && print_ptr>0) {
626 if (ctrj>1)
add(ctrj-1);
628 if (ctrj<v4)
add(v4-ctrj);
647 for (
int j=1; j<=v4; j++) {
658 for (
int i=1; i<=ctri; i++) {
666 for (
int j=1; j<=v4; j++) {
689 # define MHO 1 // X-Offset for second half of matrix-arrays (only used to insert MID-column into dumpMatrix())
690 # define BO 1 // Offset for second half of matrix (cause now the indices i and j are smaller as above)
698 for (
int j=v4-1; j>-1; j--) {
708 for (
int i=v3-1; i>=ctri; i--) {
716 for (
int j=v4-1; j>=0; j--) {
749 for (
int j=(ctri ? 0 : 1); j<=v4; j++) {
754 if (k<ctrc || ((k==ctrc) && (zza[j]!=zzb[j]) && (zzc[j]==zzd[j]))) {
761 for (
int j=v4; j>=(ctri ? 0 : 1); j--) {
779 if (display_matrix) {
780 dumpMatrix(v1, v2, v3, v4, ctri);
791 diff(v1, v2, ctri-1, ctrj, st, 0);
793 if (last_print<0 && print_ptr>0) {
800 diff(v1+ctri+1, v2+ctrj, v3-ctri-1, v4-ctrj, 0, en);
812 for (
int i=1; i<=act_seq_length; ++i) {
814 naa1[j][i] = naa2[j][i]=0;
824 for (
int i=1; i<=
nseqs; ++i) {
827 for (
int j=1; j<=seqlen_array[i]; ++j) {
834 if (seqlen_array[i]>l1) l1=seqlen_array[i];
836 else if (group[i]==2) {
838 for (
int j=1; j<=seqlen_array[i]; ++j) {
845 if (seqlen_array[i]>l2) l2=seqlen_array[i];
852 for (
int i=1; i<=
pos2; ++i) alist[i]=snd_list[i];
853 for (
int n=1; n<=l1; ++n) {
858 t_arr[j] += (weights[i][j]*naa1[i][n]);
865 naas[i][n]=t_arr[i]/k;
881 for (
int i=0; i<to_do; ++i) {
883 result[1][pos]=result[2][pos]=
'*';
889 for (
int j=0; j<=k-1; ++j) {
890 result[2][pos+j]=
'*';
891 result[1][pos+j]=
'-';
896 k = (displ[i]<0) ? displ[i] * -1 : displ[i];
897 for (
int j=0; j<=k-1; ++j) {
898 result[1][pos+j]=
'*';
899 result[2][pos+j]=
'-';
917 for (
int i=1; t[i]; i++) {
918 if (t[i]==c)
return i;
923 static void p_encode(
const unsigned char *seq,
unsigned char *naseq,
int l) {
927 for (
int i=1; i<=l; i++) {
928 int c =
res_index(amino_acid_order, seq[i]);
935 if (!warned && seq[i] !=
'?') {
937 sprintf(buf,
"Illegal character '%c' in sequence data", seq[i]);
950 static void n_encode(
const unsigned char *seq,
unsigned char *naseq,
int l) {
953 for (
int i=1; i<=l; i++) {
954 int c =
res_index(nucleic_acid_order, seq[i]);
957 if (!warned && seq[i] !=
'?') {
959 sprintf(buf,
"Illegal character '%c' in sequence data", seq[i]);
977 const int *gapsBefore1,
986 gaps_before_position = gapsBefore1;
990 is_weight = weighted;
997 for (
int i=1; i<=2 && !
error; i++) {
999 result[i] = (
char*)
ckalloc((max_seq_length+2)*
sizeof(
char), error);
1006 if (dnaflag!=is_dna || is_weight!=weighted) {
1007 error =
"Please call ClustalV_exit() between calls that differ in\n"
1008 "one of the following parameters:\n"
1009 " is_dna, weighted";
1017 #if defined(DEBUG) && 1
1020 memset(&displ[1], 0xff, max_seq_length*
sizeof(displ[1]));
1026 typedef void (*encoder)(
const unsigned char*,
unsigned char*,
int);
1030 seqlen_array[1] = length1;
1032 seqlen_array[2] = length2;
1035 int alignedLength =
add_ggaps(max_seq_length);
1037 result[1][alignedLength+1] = 0;
1038 result[2][alignedLength+1] = 0;
1042 reslen = alignedLength;
1053 for (
int i=1; i<=2; i++) {
1069 static arb_test::match_expectation clustal_aligns(
const char *i1,
const char *i2,
const char *expected1,
const char *expected2,
int expected_score) {
1070 size_t l1 = strlen(i1);
1071 size_t l2 = strlen(i2);
1073 const char *result1 =
NULp;
1074 const char *result2 =
NULp;
1078 int gaps_size = l1+1;
1079 int no_gaps_before1[gaps_size];
1080 memset(no_gaps_before1, 0, gaps_size*
sizeof(*no_gaps_before1));
1100 return all().ofgroup(expected);
1103 #define TEST_CLUSTAL_ALIGNS(i1,i2,o1,o2,score) TEST_EXPECTATION(clustal_aligns(i1,i2,o1,o2,score))
1105 void TEST_clustalV() {
1106 TEST_CLUSTAL_ALIGNS(
"ACGTTGCAACGT",
1112 TEST_CLUSTAL_ALIGNS(
"ACGTTAACGT",
1118 TEST_CLUSTAL_ALIGNS(
"ACGTTGCAACGT",
1124 TEST_CLUSTAL_ALIGNS(
"ACGTTGCAACGT",
1130 TEST_CLUSTAL_ALIGNS(
"ACGTTGCAACGT",
1136 TEST_CLUSTAL_ALIGNS(
"ACGTTGCAACGT",
1142 TEST_CLUSTAL_ALIGNS(
"ACGTTGCAACGT",
1151 #endif // UNIT_TESTS
static int getPenalty(char c1, char c2)
void * ckalloc(size_t bytes, ARB_ERROR &error)
int decr_displ(int offset, int value)
static void do_align(int &score, long act_seq_length)
match_expectation doesnt_report_error(const char *error)
static const char * nucleic_acid_order
int master_gap_open(int beforePosition)
#define DEFAULT_PROBABLY_MUTATION
int master_gapAtWithOpenPenalty(int atPosition, int length, int penalty)
static void make_pamo(int nv)
static int res_index(const char *t, char c)
static int add_ggaps(long)
#define MAX_GAP_EXTEND_DISCOUNT
#define MASTER_GAP_EXTEND
#define IF_MATRIX_DUMP(xxx)
static int diff(int v1, int v2, int v3, int v4, int st, int en)
int calc_weight(int iat, int jat, int v1, int v2)
#define MAX_GAP_OPEN_DISCOUNT
static void p_encode(const unsigned char *seq, unsigned char *naseq, int l)
static void exit_show_pair()
static int weights[MAX_BASETYPES][MAX_BASETYPES]
static const char * nucleic_maybe_U
static void error(const char *msg)
static int snd_list[MAXN+1]
expectation_group & add(const expectation &e)
int set_displ(int offset, int value)
ARB_ERROR ClustalV_align(int is_dna, int weighted, const char *seq1, int length1, const char *seq2, int length2, const int *gapsBefore1, int max_seq_length, const char *&res1, const char *&res2, int &reslen, int &score)
int master_gapAt(int atPosition, int length)
static const char * nucleic_maybe_T
#define DEFAULT_IMPROBABLY_MUTATION
#define DEFAULT_GAP_EXTEND
static const char * nucleic_maybe[6]
int baseMatch(char c1, char c2)
static ARB_ERROR init_myers(long max_seq_length)
static int pamo[(MAX_BASETYPES-1)*MAX_BASETYPES/2]
static int pam[MAX_BASETYPES][MAX_BASETYPES]
static ARB_ERROR init_show_pair(long max_seq_length)
static bool module_initialized
int slave_gapAtWithOpenPenalty(int atPosition, int length, int penalty)
static void n_encode(const unsigned char *seq, unsigned char *naseq, int l)
static int baseCmp(unsigned char c1, unsigned char c2)
fa_assert(chars< MESSAGE_BUFFERSIZE)
static const int * gaps_before_position
static const char * nucleic_maybe_A
void aw_message(const char *msg)
static int seqlen_array[MAXN+1]
ARB_ERROR MAXLENtooSmall()
char encode(const char bases[], GB_alignment_type aliType)
static const char * nucleic_maybe_G
int master_gap_extend(int beforePosition)
char * strndup(const char *str, int len)
static const char * nucleic_maybe_C
int slave_gapAt(int atPosition, int length)
int slave_gap_extend(int)
static unsigned char * seq_array[MAXN+1]
static const char * amino_acid_order
static int fst_list[MAXN+1]
GB_write_int const char s