ARB
ED4_protein_2nd_structure.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ED4_protein_2nd_structure.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 
20 #include "ed4_class.hxx"
21 #include "ed4_awars.hxx"
22 
23 #include <aw_awar.hxx>
24 #include <aw_msg.hxx>
25 #include <aw_root.hxx>
26 #include "arbdbt.h"
27 
28 #include <iostream>
29 #include <awt_config_manager.hxx>
30 
31 #define e4_assert(bed) arb_assert(bed)
32 
33 using namespace std;
34 
35 // --------------------------------------------------------------------------------
36 // exported data
37 
40  { "Perfect_match", STRUCT_PERFECT_MATCH },
41  { "Good_match", STRUCT_GOOD_MATCH },
42  { "Medium_match", STRUCT_MEDIUM_MATCH },
43  { "Bad_match", STRUCT_BAD_MATCH },
44  { "No_match", STRUCT_NO_MATCH },
45  { "Unknown_match", STRUCT_UNKNOWN },
47 };
48 
51  ARB_strdup(" "), // STRUCT_PERFECT_MATCH
52  ARB_strdup("-"), // STRUCT_GOOD_MATCH
53  ARB_strdup("~"), // STRUCT_MEDIUM_MATCH
54  ARB_strdup("+"), // STRUCT_BAD_MATCH
55  ARB_strdup("#"), // STRUCT_NO_MATCH
56  ARB_strdup("?") // STRUCT_UNKNOWN
57 };
58 
61  ARB_strdup("HH GG II TT EE BB SS -- -. .."), // STRUCT_PERFECT_MATCH
62  ARB_strdup("HG HI HS EB ES TS H- G- I- T- E- B- S-"), // STRUCT_GOOD_MATCH
63  ARB_strdup("HT GT IT"), // STRUCT_MEDIUM_MATCH
64  ARB_strdup("ET BT"), // STRUCT_BAD_MATCH
65  ARB_strdup("EH BH EG EI"), // STRUCT_NO_MATCH
66  ARB_strdup("") // STRUCT_UNKNOWN
67 };
68 
69 static struct pfold_mem_handler {
71  for (int i = 0; i<PFOLD_PAIRS; ++i) {
72  freenull(pfold_pairs[i]);
73  freenull(pfold_pair_chars[i]);
74  }
75  }
77 
78 // --------------------------------------------------------------------------------
79 
86 static const char *amino_acids = "ARDNCEQGHILKMFPSTWYV";
87 
98 static int *char2AA = NULp;
99 
109 static char structure_chars[3] = { 'H', 'E', 'T' };
110 
112 static const char *structure_breaker[2] = {
113  "NYPG",
114  "PDESGK"
115 };
116 
118 static const char *structure_indifferent[2] = {
119  "RTSC",
120  "RNHA"
121 };
122 
125  { "Secondary Structure <-> Secondary Structure", SECSTRUCT_SECSTRUCT },
126  { "Secondary Structure <-> Sequence", SECSTRUCT_SEQUENCE },
127  { "Secondary Structure <-> Sequence (Full Prediction)", SECSTRUCT_SEQUENCE_PREDICT },
129 };
130 
131 static double max_former_value[3] = { 1.42, 1.62, 156 };
132 static double min_former_value[3] = { 0.0, 0.0, 47 };
133 static double max_breaker_value[3] = { 1.21, 2.03, 0.0 };
134 
135 // --------------------------------------------------------------------------------
136 
137 // TODO: is there a way to prevent doxygen from stripping the comments from the table?
138 // I simply added the parameter table as verbatim environment to show the comments in
139 // the documentation.
181 static double cf_parameters[20][4] = {
182  /* Helix Former Strand Former Helix Breaker Strand Breaker Amino
183  Value Value Value Value Acid
184  ----------------------------------------------------------------------- */
185  { 1.34, 0.00, 0.00, 0.00 }, // A
186  { 0.00, 0.00, 0.00, 0.00 }, // R
187  { 0.50, 0.00, 0.00, 1.39 }, // D
188  { 0.00, 0.00, 1.03, 0.00 }, // N
189  { 0.00, 1.13, 0.00, 0.00 }, // C
190  { 1.42, 0.00, 0.00, 2.03 }, // E
191  { 1.05, 1.05, 0.00, 0.00 }, // Q
192  { 0.00, 0.00, 1.21, 1.00 }, // G
193  { 0.50, 0.00, 0.00, 0.00 }, // H
194  { 1.02, 1.52, 0.00, 0.00 }, // I
195  { 1.14, 1.24, 0.00, 0.00 }, // L
196  { 1.09, 0.00, 0.00, 1.01 }, // K
197  { 1.37, 1.00, 0.00, 0.00 }, // M
198  { 1.07, 1.31, 0.00, 0.00 }, // F
199  { 0.00, 0.00, 1.21, 1.36 }, // P
200  { 0.00, 0.00, 0.00, 1.00 }, // S
201  { 0.00, 1.13, 0.00, 0.00 }, // T
202  { 1.02, 1.30, 0.00, 0.00 }, // W
203  { 0.00, 1.40, 1.00, 0.00 }, // Y
204  { 1.00, 1.62, 0.00, 0.00 } }; // V
205 
246 static double cf_parameters_norm[20][7] = {
247  /* P(a) P(b) P(turn) f(i) f(i+1) f(i+2) f(i+3) Amino Acid
248  -------------------------------------------------------------------- */
249  { 142, 83, 66, 0.060, 0.076, 0.035, 0.058 }, // A
250  { 98, 93, 95, 0.070, 0.106, 0.099, 0.085 }, // R
251  { 101, 54, 146, 0.147, 0.110, 0.179, 0.081 }, // D
252  { 67, 89, 156, 0.161, 0.083, 0.191, 0.091 }, // N
253  { 70, 119, 119, 0.149, 0.050, 0.117, 0.128 }, // C
254  { 151, 37, 74, 0.056, 0.060, 0.077, 0.064 }, // E
255  { 111, 110, 98, 0.074, 0.098, 0.037, 0.098 }, // Q
256  { 57, 75, 156, 0.102, 0.085, 0.190, 0.152 }, // G
257  { 100, 87, 95, 0.140, 0.047, 0.093, 0.054 }, // H
258  { 108, 160, 47, 0.043, 0.034, 0.013, 0.056 }, // I
259  { 121, 130, 59, 0.061, 0.025, 0.036, 0.070 }, // L
260  { 116, 74, 101, 0.055, 0.115, 0.072, 0.095 }, // K
261  { 145, 105, 60, 0.068, 0.082, 0.014, 0.055 }, // M
262  { 113, 138, 60, 0.059, 0.041, 0.065, 0.065 }, // F
263  { 57, 55, 152, 0.102, 0.301, 0.034, 0.068 }, // P
264  { 77, 75, 143, 0.120, 0.139, 0.125, 0.106 }, // S
265  { 83, 119, 96, 0.086, 0.108, 0.065, 0.079 }, // T
266  { 108, 137, 96, 0.077, 0.013, 0.064, 0.167 }, // W
267  { 69, 147, 114, 0.082, 0.065, 0.114, 0.125 }, // Y
268  { 106, 170, 50, 0.062, 0.048, 0.028, 0.053 } }; // V
269 
270 // --------------------------------------------------------------------------------
271 
281 inline int ED4_pfold_round_sym(double d) {
282  return int(d + .5);
283 }
284 
285 
295 static void ED4_pfold_init_statics() {
296  // specify the characters used for amino acid one letter code
297  if (!char2AA) {
298  char2AA = new int [256];
299  for (int i = 0; i < 256; i++) {
300  char2AA[i] = -1;
301  }
302  for (int i = 0; amino_acids[i]; i++) {
303  char2AA[(unsigned char)amino_acids[i]] = i;
304  }
305  }
306 }
307 
308 
324 static void ED4_pfold_find_nucleation_sites(const unsigned char *sequence, char *structure, int length, const PFOLD_STRUCTURE s) {
325 #ifdef SHOW_PROGRESS
326  cout << endl << "Searching for nucleation sites:" << endl;
327 #endif
328  e4_assert(s == ALPHA_HELIX || s == BETA_SHEET); // incorrect value for s
329  e4_assert(char2AA); // char2AA not initialized; ED4_pfold_init_statics() failed or hasn't been called yet
330 
331  char *gap_chars = ED4_ROOT->aw_root->awar(ED4_AWAR_GAP_CHARS)->read_string(); // gap characters
332  int windowSize = (s == ALPHA_HELIX ? 6 : 5); // window size used for finding nucleation sites
333  double sumOfFormVal = 0, sumOfBreakVal = 0; // sum of former resp. breaker values in window
334  int pos; // current position in sequence
335  int count; // number of amino acids found in window
336 
337  for (int i = 0; i < ((length + 1) - windowSize); i++) {
338  int aa = 0; // amino acid
339 
340  pos = i;
341  for (count = 0; count < windowSize; count++) {
342  // skip gaps
343  while (pos < ((length + 1) - windowSize) &&
344  strchr(gap_chars, sequence[pos + count])) {
345  pos++;
346  }
347  aa = char2AA[sequence[pos + count]];
348  if (aa == -1) break; // unknown character found
349 
350  // compute former and breaker values
351  sumOfFormVal += cf_parameters[aa][s];
352  sumOfBreakVal += cf_parameters[aa][s+2];
353  }
354 
355  // assign sequence and save start and end of nucleation site for later extension
356  if ((sumOfFormVal > (windowSize - 2)) && (sumOfBreakVal < 2)) {
357  for (int j = i; j < (pos + count); j++) {
358  if (char2AA[sequence[j]] != -1) structure[j] = structure_chars[s];
359  }
360  }
361  if (aa == -1) i = pos + count; // skip unknown character
362  sumOfFormVal = 0, sumOfBreakVal = 0;
363  }
364 
365  free(gap_chars);
366 #ifdef SHOW_PROGRESS
367  cout << structure << endl;
368 #endif
369 }
370 
371 
386 static void ED4_pfold_extend_nucleation_sites(const unsigned char *sequence, char *structure, int length, const PFOLD_STRUCTURE s) {
387 #ifdef SHOW_PROGRESS
388  cout << endl << "Extending nucleation sites:" << endl;
389 #endif
390  e4_assert(s == ALPHA_HELIX || s == BETA_SHEET); // incorrect value for s
391  e4_assert(char2AA); // char2AA not initialized; ED4_pfold_init_statics() failed or hasn't been called yet
392 
393  bool break_structure = false; // break the current structure
394  int start = 0, end = 0; // start and end of nucleation site
395  int neighbour = 0; // neighbour of start or end
396 
397  char *gap_chars = ED4_ROOT->aw_root->awar(ED4_AWAR_GAP_CHARS)->read_string(); // gap characters
398 
399  // find nucleation sites and extend them in both directions (for whole sequence)
400  for (int indStruct = 0; indStruct < length; indStruct++) {
401 
402  // search start and end of nucleated region
403  while (indStruct < length &&
404  ((structure[indStruct] == ' ') || strchr(gap_chars, sequence[indStruct]))
405  ) indStruct++;
406 
407  if (indStruct >= length) break;
408  // get next amino acid that is not included in nucleation site
409  start = indStruct - 1;
410  while (indStruct < length &&
411  (structure[indStruct] != ' ' || strchr(gap_chars, sequence[indStruct]))) {
412  indStruct++;
413  }
414  // get next amino acid that is not included in nucleation site
415  end = indStruct;
416 
417  // extend nucleated region in both directions
418  // left side:
419  while (start > 1 && strchr(gap_chars, sequence[start])) {
420  start--; // skip gaps
421  }
422  // break if no amino acid is found
423  if (start >= 0) break_structure = (char2AA[sequence[start]] == -1);
424  while (!break_structure && (start > 1) && (structure[start] == ' ')) {
425  // break if absolute breaker (P or E) is found
426  break_structure = (sequence[start] == 'P');
427  if (s == BETA_SHEET) break_structure |= (sequence[start] == 'E');
428  if (break_structure) break;
429  // check for breaker at current position
430  break_structure = strchr(structure_breaker[s], sequence[start]);
431  neighbour = start - 1; // get neighbour
432  while (neighbour > 0 && strchr(gap_chars, sequence[neighbour])) {
433  neighbour--; // skip gaps
434  }
435  // break if out of bounds or no amino acid is found
436  if (neighbour <= 0 || char2AA[sequence[neighbour]] == -1) {
437  break;
438  }
439  // break if another breaker or indifferent amino acid is found
440  break_structure &=
441  strchr(structure_breaker[s], sequence[neighbour]) ||
442  strchr(structure_indifferent[s], sequence[neighbour]);
443  if (!break_structure) {
444  structure[start] = structure_chars[s];
445  }
446  start = neighbour; // continue with neighbour
447  }
448 
449  // right side:
450  while (end < (length - 2) && strchr(gap_chars, sequence[end])) {
451  end++; // skip gaps
452  }
453  // break if no amino acid is found
454  if (end <= (length - 1)) break_structure = (char2AA[sequence[end]] == -1);
455  while (!break_structure && (end < (length - 2))) {
456  // break if absolute breaker (P or E) is found
457  break_structure = (sequence[end] == 'P');
458  if (s == BETA_SHEET) break_structure |= (sequence[end] == 'E');
459  if (break_structure) break;
460  // check for breaker at current position
461  break_structure = strchr(structure_breaker[s], sequence[end]);
462  neighbour = end + 1; // get neighbour
463  while (neighbour < (length - 2) && strchr(gap_chars, sequence[neighbour])) {
464  neighbour++; // skip gaps
465  }
466  // break if out of bounds or no amino acid is found
467  if (neighbour >= (length - 1) || char2AA[sequence[neighbour]] == -1) {
468  end = neighbour;
469  break;
470  }
471  // break if another breaker or indifferent amino acid is found
472  break_structure &=
473  strchr(structure_breaker[s], sequence[neighbour]) ||
474  strchr(structure_indifferent[s], sequence[neighbour]);
475  if (!break_structure) {
476  structure[end] = structure_chars[s];
477  }
478  end = neighbour; // continue with neighbour
479  }
480  indStruct = end; // continue with end
481  }
482 
483  free(gap_chars);
484 #ifdef SHOW_PROGRESS
485  cout << structure << endl;
486 #endif
487 }
488 
489 
504 static void ED4_pfold_find_turns(const unsigned char *sequence, char *structure, int length) {
505 #ifdef SHOW_PROGRESS
506  cout << endl << "Searching for beta-turns: " << endl;
507 #endif
508  e4_assert(char2AA); // char2AA not initialized; ED4_pfold_init_statics() failed or hasn't been called yet
509 
510  char *gap_chars = ED4_ROOT->aw_root->awar(ED4_AWAR_GAP_CHARS)->read_string(); // gap characters
511  const int windowSize = 4; // window size
512  double P_a = 0, P_b = 0, P_turn = 0; // former values for helix, sheet and beta-turn
513  double p_t = 1; // probability for beta-turn
514  int pos; // position in sequence
515  int count; // position in window
516  int aa; // amino acid
517 
518  for (int i = 0; i < ((length + 1) - windowSize); i++) {
519  pos = i;
520  for (count = 0; count < windowSize; count++) {
521  // skip gaps
522  while (pos < ((length + 1) - windowSize) &&
523  strchr(gap_chars, sequence[pos + count])) {
524  pos++;
525  }
526  aa = char2AA[sequence[pos + count]];
527  if (aa == -1) break; // unknown character found
528 
529  // compute former values and turn probability
530  P_a += cf_parameters_norm[aa][0];
531  P_b += cf_parameters_norm[aa][1];
532  P_turn += cf_parameters_norm[aa][2];
533  p_t *= cf_parameters_norm[aa][3 + count];
534  }
535  if (count != 0) {
536  P_a /= count;
537  P_b /= count;
538  P_turn /= count;
539  if ((p_t > 0.000075) && (P_turn > 100) && (P_turn > P_a) && (P_turn > P_b)) {
540  for (int j = i; j < (pos + count); j++) {
541  if (char2AA[sequence[j]] != -1) structure[j] = structure_chars[BETA_TURN];
542  }
543  }
544  }
545  if (aa == -1) i = pos + count; // skip unknown character
546  p_t = 1, P_a = 0, P_b = 0, P_turn = 0;
547  }
548 
549  free(gap_chars);
550 #ifdef SHOW_PROGRESS
551  cout << structure << endl;
552 #endif
553 }
554 
555 
573 static void ED4_pfold_resolve_overlaps(const unsigned char *sequence, char *structures[4], int length) {
574 #ifdef SHOW_PROGRESS
575  cout << endl << "Resolving overlaps: " << endl;
576 #endif
577  e4_assert(char2AA); // char2AA not initialized; ED4_pfold_init_statics() failed or hasn't been called yet
578 
579  int start = -1; // start of overlap
580  int end = -1; // end of overlap
581  double P_a = 0; // sum of former values for alpha-helix in overlapping regions
582  double P_b = 0; // sum of former values for beta-sheet in overlapping regions
583  PFOLD_STRUCTURE s; // structure with the highest former values
584  char *gap_chars = ED4_ROOT->aw_root->awar(ED4_AWAR_GAP_CHARS)->read_string(); // gap characters
585 
586  // scan structures for overlaps
587  for (int pos = 0; pos < length; pos++) {
588 
589  // if beta-turn is found at position pos -> summary is beta-turn
590  if (structures[BETA_TURN][pos] != ' ') {
591  structures[STRUCTURE_SUMMARY][pos] = structure_chars[BETA_TURN];
592 
593  // if helix and sheet are overlapping and no beta-turn is found -> check which structure has the highest sum of former values
594  }
595  else if ((structures[ALPHA_HELIX][pos] != ' ') && (structures[BETA_SHEET][pos] != ' ')) {
596 
597  // search start and end of overlap (as long as no beta-turn is found)
598  start = pos;
599  end = pos;
600  while (structures[ALPHA_HELIX][end] != ' ' && structures[BETA_SHEET][end] != ' ' &&
601  structures[BETA_TURN][end] == ' ') {
602  end++;
603  }
604 
605  // calculate P_a and P_b for overlap
606  for (int i = start; i < end; i++) {
607  // skip gaps
608  while (i < end && strchr(gap_chars, sequence[i])) {
609  i++;
610  }
611  int aa = char2AA[sequence[i]];
612  if (aa != -1) {
613  P_a += cf_parameters[aa][ALPHA_HELIX];
614  P_b += cf_parameters[aa][BETA_SHEET];
615  }
616  }
617 
618  // check which structure is more likely and set s appropriately
619  s = (P_a > P_b) ? ALPHA_HELIX : BETA_SHEET;
620 
621  // set structure for overlapping region
622  for (int i = start; i < end; i++) {
623  structures[STRUCTURE_SUMMARY][i] = structure_chars[s];
624  }
625 
626  // set variables for next pass of loop resp. end of loop
627  P_a = 0, P_b = 0;
628  pos = end - 1;
629 
630  // if helix and sheet are not overlapping and no beta-turn is found -> set structure accordingly
631  }
632  else {
633  // summary at position pos is helix resp. sheet
634  if (structures[ALPHA_HELIX][pos] != ' ') {
635  structures[STRUCTURE_SUMMARY][pos] = structure_chars[ALPHA_HELIX];
636  }
637  else if (structures[BETA_SHEET][pos] != ' ') {
638  structures[STRUCTURE_SUMMARY][pos] = structure_chars[BETA_SHEET];
639  }
640  }
641  }
642 
643  free(gap_chars);
644 #ifdef SHOW_PROGRESS
645  cout << structures[summary] << endl;
646 #endif
647 }
648 
649 
668 static GB_ERROR ED4_pfold_predict_structure(const unsigned char *sequence, char *structures[4], int length) {
669 #ifdef SHOW_PROGRESS
670  cout << endl << "Predicting secondary structure for sequence:" << endl << sequence << endl;
671 #endif
672  GB_ERROR error = NULp;
673  e4_assert((int)strlen((const char *)sequence) == length);
674 
675  // init memory
677  e4_assert(char2AA);
678 
679  // predict structure
680  ED4_pfold_find_nucleation_sites(sequence, structures[ALPHA_HELIX], length, ALPHA_HELIX);
681  ED4_pfold_find_nucleation_sites(sequence, structures[BETA_SHEET], length, BETA_SHEET);
682  ED4_pfold_extend_nucleation_sites(sequence, structures[ALPHA_HELIX], length, ALPHA_HELIX);
683  ED4_pfold_extend_nucleation_sites(sequence, structures[BETA_SHEET], length, BETA_SHEET);
684  ED4_pfold_find_turns(sequence, structures[BETA_TURN], length);
685  ED4_pfold_resolve_overlaps(sequence, structures, length);
686 
687  return error;
688 }
689 
690 GB_ERROR ED4_pfold_calculate_secstruct_match(const unsigned char *structure_sai, const unsigned char *structure_cmp, const int start, const int end, char *result_buffer, PFOLD_MATCH_METHOD match_method) {
691  GB_ERROR error = NULp;
692  e4_assert(structure_sai);
693  e4_assert(structure_cmp);
694  e4_assert(start >= 0);
695  e4_assert(result_buffer);
696  e4_assert(match_method >= 0 && match_method < PFOLD_MATCH_METHOD_COUNT);
698  e4_assert(char2AA);
699 
700  e4_assert(end >= start);
701 
702  size_t end_minus_start = size_t(end-start); // @@@ use this
703 
704  size_t length = strlen((const char *)structure_sai);
705  size_t match_end = std::min(std::min(end_minus_start, length), strlen((const char *)structure_cmp));
706 
707  enum { BEND = 3, NOSTRUCT = 4 };
708  char *struct_chars[] = {
709  ARB_strdup("HGI"), // helical structures (enum ALPHA_HELIX)
710  ARB_strdup("EB"), // sheet-like structures (enum BETA_SHEET)
711  ARB_strdup("T"), // beta-turn (enum BETA_TURN)
712  ARB_strdup("S"), // bends (enum BEND)
713  ARB_strdup("") // no structure (enum NOSTRUCT)
714  };
715 
716  // init awars
717  AW_root *awr = ED4_ROOT->aw_root;
718  char *gap_chars = awr->awar(ED4_AWAR_GAP_CHARS)->read_string();
719  char *pairs[PFOLD_MATCH_TYPE_COUNT] = { NULp };
720  char *pair_chars[PFOLD_MATCH_TYPE_COUNT] = { NULp };
721  char *pair_chars_2 = awr->awar(PFOLD_AWAR_SYMBOL_TEMPLATE_2)->read_string();
722  char awar[256];
723 
724  for (int i = 0; pfold_match_type_awars[i].name; i++) {
725  sprintf(awar, PFOLD_AWAR_PAIR_TEMPLATE, pfold_match_type_awars[i].name);
726  pairs[i] = awr->awar(awar)->read_string();
727  sprintf(awar, PFOLD_AWAR_SYMBOL_TEMPLATE, pfold_match_type_awars[i].name);
728  pair_chars[i] = awr->awar(awar)->read_string();
729  }
730 
731  int struct_start = start;
732  int struct_end = start;
733  size_t count = 0;
734  int current_struct = 4;
735  int aa = -1;
736  double prob = 0;
737 
738  // TODO: move this check to callback for the corresponding field?
739  if (strlen(pair_chars_2) != 10) {
740  error = GB_export_error("You have to define 10 match symbols.");
741  }
742 
743  if (!error) {
744  switch (match_method) {
745 
746  case SECSTRUCT_SECSTRUCT:
747  // TODO: one could try to find out, if structure_cmp is really a secondary structure and not a sequence (define awar for allowed symbols in secondary structure)
748  for (count = 0; count < match_end; count++) {
749  result_buffer[count] = *pair_chars[STRUCT_UNKNOWN];
750  for (int n_pt = 0; n_pt < PFOLD_MATCH_TYPE_COUNT; n_pt++) {
751  int len = strlen(pairs[n_pt])-1;
752  char *p = pairs[n_pt];
753  for (int j = 0; j < len; j += 3) {
754  if ((p[j] == structure_sai[count + start] && p[j+1] == structure_cmp[count + start]) ||
755  (p[j] == structure_cmp[count + start] && p[j+1] == structure_sai[count + start])) {
756  result_buffer[count] = *pair_chars[n_pt];
757  n_pt = PFOLD_MATCH_TYPE_COUNT; // stop searching the pair types
758  break; // stop searching the pairs array
759  }
760  }
761  }
762  }
763 
764  // fill the remaining buffer with spaces
765  while (count <= end_minus_start) { // LOOP_VECTORIZED[!<720]
766  result_buffer[count] = ' ';
767  count++;
768  }
769  break;
770 
771  case SECSTRUCT_SEQUENCE:
772  // clear result buffer
773  for (size_t i = 0; i <= end_minus_start; i++) result_buffer[i] = ' '; // LOOP_VECTORIZED[!<720]
774 
775  // skip gaps
776  while (structure_sai[struct_start] != '\0' && structure_cmp[struct_start] != '\0' &&
777  strchr(gap_chars, structure_sai[struct_start]) &&
778  strchr(gap_chars, structure_cmp[struct_start])) {
779  struct_start++;
780  }
781  if (structure_sai[struct_start] == '\0' || structure_cmp[struct_start] == '\0') break;
782 
783  // check structure at the first displayed position and find out where it starts
784  for (current_struct = 0; current_struct < 4 && !strchr(struct_chars[current_struct], structure_sai[struct_start]); current_struct++) {
785  ;
786  }
787  if (current_struct != BEND && current_struct != NOSTRUCT) {
788  struct_start--; // check structure left of start
789  while (struct_start >= 0) {
790  // skip gaps
791  while (struct_start > 0 &&
792  strchr(gap_chars, structure_sai[struct_start]) &&
793  strchr(gap_chars, structure_cmp[struct_start])) {
794  struct_start--;
795  }
796  aa = char2AA[structure_cmp[struct_start]];
797  if (struct_start == 0 && aa == -1) { // nothing was found
798  break;
799  }
800  else if (strchr(struct_chars[current_struct], structure_sai[struct_start]) && aa != -1) {
801  prob += cf_former(aa, current_struct) - cf_breaker(aa, current_struct); // sum up probabilities
802  struct_start--;
803  count++;
804  }
805  else {
806  break;
807  }
808  }
809  }
810 
811  // parse structures
812  struct_start = start;
813  // skip gaps
814  while (structure_sai[struct_start] != '\0' && structure_cmp[struct_start] != '\0' &&
815  strchr(gap_chars, structure_sai[struct_start]) &&
816  strchr(gap_chars, structure_cmp[struct_start])) {
817  struct_start++;
818  }
819  if (structure_sai[struct_start] == '\0' || structure_cmp[struct_start] == '\0') break;
820  struct_end = struct_start;
821  while (struct_end < end) {
822  aa = char2AA[structure_cmp[struct_end]];
823  if (current_struct == NOSTRUCT) { // no structure found -> move on
824  struct_end++;
825  }
826  else if (aa == -1) { // structure found but no corresponding amino acid -> doesn't fit at all
827  result_buffer[struct_end - start] = pair_chars_2[0];
828  struct_end++;
829  }
830  else if (current_struct == BEND) { // bend found -> fits perfectly everywhere
831  result_buffer[struct_end - start] = pair_chars_2[9];
832  struct_end++;
833  }
834  else { // helix, sheet or beta-turn found -> while structure doesn't change: sum up probabilities
835  while (structure_sai[struct_end] != '\0') {
836  // skip gaps
837  while (strchr(gap_chars, structure_sai[struct_end]) &&
838  strchr(gap_chars, structure_cmp[struct_end]) &&
839  structure_sai[struct_end] != '\0' && structure_cmp[struct_end] != '\0') {
840  struct_end++;
841  }
842  aa = char2AA[structure_cmp[struct_end]];
843  if (structure_sai[struct_end] != '\0' && structure_cmp[struct_end] != '\0' &&
844  strchr(struct_chars[current_struct], structure_sai[struct_end]) && aa != -1) {
845  prob += cf_former(aa, current_struct) - cf_breaker(aa, current_struct); // sum up probabilities
846  struct_end++;
847  count++;
848  }
849  else {
850  break;
851  }
852  }
853 
854  if (count != 0) {
855  // compute average and normalize probability
856  prob /= count;
857  prob = (prob + max_breaker_value[current_struct] - min_former_value[current_struct]) / (max_breaker_value[current_struct] + max_former_value[current_struct] - min_former_value[current_struct]);
858 
859 #if 0 // code w/o effect
860  // map to match characters and store in result_buffer
861  int prob_normalized = ED4_pfold_round_sym(prob * 9);
862  // e4_assert(prob_normalized >= 0 && prob_normalized <= 9); // if this happens check if normalization is correct or some undefined characters mess everything up
863  char prob_symbol = *pair_chars[STRUCT_UNKNOWN];
864  if (prob_normalized >= 0 && prob_normalized <= 9) {
865  prob_symbol = pair_chars_2[prob_normalized];
866  }
867 #endif
868  }
869  }
870 
871  // find next structure type
872  if (structure_sai[struct_end] == '\0' || structure_cmp[struct_end] == '\0') {
873  break;
874  }
875  else {
876  prob = 0;
877  count = 0;
878  struct_start = struct_end;
879  for (current_struct = 0; current_struct < 4 && !strchr(struct_chars[current_struct], structure_sai[struct_start]); current_struct++) {
880  ;
881  }
882  }
883  }
884  break;
885 
887  // predict structures from structure_cmp and compare with structure_sai
888  char *structures[4];
889  for (int i = 0; i < 4 && !error; i++) {
890  structures[i] = new char [length + 1];
891  if (!structures[i]) {
892  error = "Out of memory";
893  }
894  else {
895  for (size_t j = 0; j < length; j++) { // LOOP_VECTORIZED[!<810]
896  structures[i][j] = ' ';
897  }
898  structures[i][length] = '\0';
899  }
900  }
901  if (!error) error = ED4_pfold_predict_structure(structure_cmp, structures, length);
902  if (!error) {
903  for (count = 0; count < match_end; count++) {
904  result_buffer[count] = *pair_chars[STRUCT_UNKNOWN];
905  if (!strchr(gap_chars, structure_sai[count + start]) && strchr(gap_chars, structure_cmp[count + start])) {
906  result_buffer[count] = *pair_chars[STRUCT_NO_MATCH];
907  } else if (strchr(gap_chars, structure_sai[count + start]) ||
908  (structures[ALPHA_HELIX][count + start] == ' ' && structures[BETA_SHEET][count + start] == ' ' && structures[BETA_TURN][count + start] == ' ')) {
909  result_buffer[count] = *pair_chars[STRUCT_PERFECT_MATCH];
910  }
911  else {
912  // search for good match first
913  // if found: stop searching
914  // otherwise: continue searching for a less good match
915  for (int n_pt = 0; n_pt < PFOLD_MATCH_TYPE_COUNT; n_pt++) {
916  int len = strlen(pairs[n_pt])-1;
917  char *p = pairs[n_pt];
918  for (int n_struct = 0; n_struct < 3; n_struct++) {
919  for (int j = 0; j < len; j += 3) {
920  if ((p[j] == structures[n_struct][count + start] && p[j+1] == structure_sai[count + start]) ||
921  (p[j] == structure_sai[count + start] && p[j+1] == structures[n_struct][count + start])) {
922  result_buffer[count] = *pair_chars[n_pt];
923  n_struct = 3; // stop searching the structures
924  n_pt = PFOLD_MATCH_TYPE_COUNT; // stop searching the pair types
925  break; // stop searching the pairs array
926  }
927  }
928  }
929  }
930  }
931  }
932  // fill the remaining buffer with spaces
933  while (count <= end_minus_start) { // LOOP_VECTORIZED[!<720]
934  result_buffer[count] = ' ';
935  count++;
936  }
937  }
938  // free buffer
939  for (int i = 0; i < 4; i++) {
940  if (structures[i]) {
941  delete structures[i];
942  structures[i] = NULp;
943  }
944  }
945  break;
946 
947  default:
948  e4_assert(0); // function called with invalid argument for 'match_method'
949  break;
950  }
951  }
952 
953  free(gap_chars);
954  free(pair_chars_2);
955  for (int i = 0; pfold_match_type_awars[i].name; i++) {
956  free(pairs[i]);
957  free(pair_chars[i]);
958  }
959  if (error) {
960  // clear result buffer
961  for (size_t i = 0; i <= end_minus_start; i++) result_buffer[i] = ' '; // LOOP_VECTORIZED[!<720]
962  }
963  return error;
964 }
965 
966 
967 GB_ERROR ED4_pfold_set_SAI(char **protstruct, GBDATA *gb_main, const char *alignment_name, long *protstruct_len) { // @@@ use references for protstruct + protstruct_len
968  GB_ERROR error = NULp;
969  GB_transaction ta(gb_main);
970  AW_root *aw_root = ED4_ROOT->aw_root;
971  char *SAI_name = aw_root->awar(PFOLD_AWAR_SELECTED_SAI)->read_string();
972  GBDATA *gb_protstruct = GBT_find_SAI(gb_main, SAI_name);
973 
974  freenull(*protstruct);
975 
976  if (gb_protstruct) {
977  GBDATA *gb_data = GBT_find_sequence(gb_protstruct, alignment_name);
978  if (gb_data) *protstruct = GB_read_string(gb_data); // @@@ NOT_ALL_SAI_HAVE_DATA
979  }
980 
981  if (*protstruct) {
982  if (protstruct_len) *protstruct_len = (long)strlen(*protstruct);
983  }
984  else {
985  if (protstruct_len) protstruct_len = NULp;
986  if (aw_root->awar(PFOLD_AWAR_ENABLE)->read_int()) {
987  error = GBS_global_string("SAI \"%s\" does not exist.\nDisabled protein structure display!", SAI_name);
988  aw_root->awar(PFOLD_AWAR_ENABLE)->write_int(0);
989  }
990  }
991 
992  free(SAI_name);
993  return error;
994 }
995 
1012  e4_assert(aww);
1013  e4_assert(oms);
1014  char *selected_sai = ED4_ROOT->aw_root->awar(PFOLD_AWAR_SELECTED_SAI)->read_string();
1015  char *sai_filter = ED4_ROOT->aw_root->awar(PFOLD_AWAR_SAI_FILTER)->read_string();
1016 
1018  if (set_sai) {
1020  if (err) aw_message(err);
1021  }
1022 
1023  aww->clear_option_menu(oms);
1024  aww->insert_default_option(selected_sai, "", selected_sai);
1025  GB_transaction ta(gb_main);
1026 
1027  for (GBDATA *sai = GBT_first_SAI(gb_main);
1028  sai;
1029  sai = GBT_next_SAI(sai))
1030  {
1031  const char *sai_name = GBT_get_name_or_description(sai);
1032  if (strcmp(sai_name, selected_sai) != 0 && strstr(sai_name, sai_filter)) {
1033  aww->callback(makeWindowCallback(ED4_pfold_select_SAI_and_update_option_menu, oms, true)); // @@@ used as OPTIONMENU_SELECT_CB (see #559)
1034  aww->insert_option(sai_name, "", sai_name);
1035  }
1036  }
1037 
1038  free(selected_sai);
1039  free(sai_filter);
1040  aww->update_option_menu();
1041  // ED4_expose_all_windows();
1042  // @@@ need update here ?
1043 }
1044 
1046  cdef.add(PFOLD_AWAR_ENABLE, "enable");
1047  cdef.add(PFOLD_AWAR_SELECTED_SAI, "sai");
1048  cdef.add(PFOLD_AWAR_SAI_FILTER, "saifilter");
1049  cdef.add(PFOLD_AWAR_MATCH_METHOD, "matchmethod");
1050  cdef.add(PFOLD_AWAR_SYMBOL_TEMPLATE_2, "matchsymbols2");
1051 
1052  char awar[256];
1053  for (int i = 0; pfold_match_type_awars[i].name; i++) {
1054  const char *name = pfold_match_type_awars[i].name;
1055  sprintf(awar, PFOLD_AWAR_PAIR_TEMPLATE, name);
1056  cdef.add(awar, name);
1057  sprintf(awar, PFOLD_AWAR_SYMBOL_TEMPLATE, name);
1058  cdef.add(awar, GBS_global_string("%s_symbol", name));
1059  }
1060 }
1061 
1062 AW_window *ED4_pfold_create_props_window(AW_root *awr, const WindowCallback *refreshCallback) {
1063  AW_window_simple *aws = new AW_window_simple;
1064  aws->init(awr, "PFOLD_PROPS", "PROTEIN_MATCH_SETTINGS");
1065 
1066  // create close button
1067  aws->at(10, 10);
1068  aws->auto_space(5, 2);
1069  aws->callback(AW_POPDOWN);
1070  aws->create_button("CLOSE", "CLOSE", "C");
1071 
1072  // create help button
1073  aws->callback(makeHelpCallback("pfold_props.hlp"));
1074  aws->create_button("HELP", "HELP");
1075 
1076  AWT_insert_config_manager(aws, AW_ROOT_DEFAULT, "pfold", makeConfigSetupCallback(setup_pfold_config));
1077 
1078  aws->at_newline();
1079 
1080  aws->label_length(27);
1081  int ex = 0, ey = 0;
1082  char awar[256];
1083 
1084  // create toggle field for showing the protein structure match
1085  aws->label("Show protein structure match?");
1086  aws->callback(*refreshCallback); // @@@ used as TOGGLE_CLICK_CB (see #559)
1087  aws->create_toggle(PFOLD_AWAR_ENABLE);
1088  aws->at_newline();
1089 
1090  // create SAI option menu
1091  aws->label_length(30);
1092  aws->label("Selected Protein Structure SAI");
1093  AW_option_menu_struct *oms_sai = aws->create_option_menu(PFOLD_AWAR_SELECTED_SAI, true);
1094  ED4_pfold_select_SAI_and_update_option_menu(aws, oms_sai, false);
1095  aws->at_newline();
1096  aws->label("-> Filter SAI names for");
1097  aws->callback(makeWindowCallback(ED4_pfold_select_SAI_and_update_option_menu, oms_sai, false)); // @@@ used as INPUTFIELD_CB (see #559)
1098  aws->create_input_field(PFOLD_AWAR_SAI_FILTER, 10);
1099  aws->at_newline();
1100 
1101  // create match method option menu
1103  aws->label_length(12);
1104  aws->label("Match Method");
1105  aws->create_option_menu(PFOLD_AWAR_MATCH_METHOD, true);
1106  for (int i = 0; const char *mm_aw = pfold_match_method_awars[i].name; i++) {
1107  aws->callback(*refreshCallback); // @@@ used as OPTIONMENU_SELECT_CB (see #559)
1108  if (match_method == pfold_match_method_awars[i].value) {
1109  aws->insert_default_option(mm_aw, "", match_method);
1110  }
1111  else {
1112  aws->insert_option(mm_aw, "", pfold_match_method_awars[i].value);
1113  }
1114  }
1115  aws->update_option_menu();
1116  aws->at_newline();
1117 
1118  // create match symbols and/or match types input fields
1119  // TODO: show only fields that are relevant for current match method -> bind to callback function?
1120  aws->label_length(40);
1121  aws->label("Match Symbols (Range 0-100% in steps of 10%)");
1122  aws->callback(*refreshCallback); // @@@ used as INPUTFIELD_CB (see #559)
1123  aws->create_input_field(PFOLD_AWAR_SYMBOL_TEMPLATE_2, 10);
1124  aws->at_newline();
1125  for (int i = 0; pfold_match_type_awars[i].name; i++) {
1126  aws->label_length(12);
1127  sprintf(awar, PFOLD_AWAR_PAIR_TEMPLATE, pfold_match_type_awars[i].name);
1128  aws->label(pfold_match_type_awars[i].name);
1129  aws->callback(*refreshCallback); // @@@ used as INPUTFIELD_CB (see #559)
1130  aws->create_input_field(awar, 30);
1131  // TODO: is it possible to disable input field for STRUCT_UNKNOWN?
1132  // if (pfold_match_type_awars[i].value == STRUCT_UNKNOWN)
1133  if (!i) aws->get_at_position(&ex, &ey);
1134  sprintf(awar, PFOLD_AWAR_SYMBOL_TEMPLATE, pfold_match_type_awars[i].name);
1135  aws->callback(*refreshCallback); // @@@ used as INPUTFIELD_CB (see #559)
1136  aws->create_input_field(awar, 3);
1137  aws->at_newline();
1138  }
1139 
1140  aws->window_fit();
1141  return aws;
1142 }
1143 
void insert_option(AW_label choice_label, const char *mnemonic, const char *var_value, const char *name_of_color=NULp)
const char * GB_ERROR
Definition: arb_core.h:25
Compare two protein secondary structures.
GBDATA * GBT_first_SAI(GBDATA *gb_main)
Definition: aditem.cxx:162
#define PFOLD_PAIRS
void add(const char *awar_name, const char *config_name)
AW_window * ED4_pfold_create_props_window(AW_root *awr, const WindowCallback *refreshCallback)
Creates the "Protein Match Settings" window.
static int * char2AA
Maps character to amino acid one letter code.
static name_value_pair pfold_match_method_awars[4]
Awars for the match method; binds the PFOLD_MATCH_METHOD to the corresponding name that is used to cr...
static double cf_parameters[20][4]
Former and breaker values for alpha-helices and beta-sheets (= strands).
#define PFOLD_AWAR_SAI_FILTER
Filter SAIs for given criteria (string); used in option menu for SAI selection.
void at(int x, int y)
Definition: AW_at.cxx:93
long
Definition: AW_awar.cxx:154
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
void AWT_insert_config_manager(AW_window *aww, AW_default default_file_, const char *id, const StoreConfigCallback &store_cb, const RestoreConfigCallback &load_or_reset_cb, const char *macro_id, const AWT_predefined_config *predef)
long read_int() const
Definition: AW_awar.cxx:187
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
#define cf_former(aa, strct)
Returns the former value of an amino acid depending on the given structure type.
ED4_root * ED4_ROOT
Definition: ED4_main.cxx:48
#define PFOLD_AWAR_SYMBOL_TEMPLATE_2
Symbols for the match quality as used for match method SECSTRUCT_SEQUENCE.
static char * alignment_name
STL namespace.
void AW_POPDOWN(AW_window *window)
Definition: AW_window.cxx:52
char * pfold_pair_chars[PFOLD_PAIRS]
Symbols for the match quality (defined by PFOLD_MATCH_TYPE) as used for match methods SECSTRUCT_SECST...
Compare an amino acid sequence with a reference protein secondary structure.
name_value_pair pfold_match_type_awars[]
Awars for the match type; binds the PFOLD_MATCH_TYPE to the corresponding awar name.
char * protstruct
Definition: ed4_class.hxx:1455
static double max_breaker_value[3]
Maximum breaker value for alpha-helix, beta-sheet (in cf_parameters) and beta-turn (no breaker values...
void update_option_menu()
char * pfold_pairs[PFOLD_PAIRS]
Match pair definition (see PFOLD_MATCH_TYPE) as used for match methods SECSTRUCT_SECSTRUCT and SECSTR...
static void ED4_pfold_select_SAI_and_update_option_menu(AW_window *aww, AW_option_menu_struct *oms, bool set_sai)
Callback function to select the reference protein structure SAI and to update the SAI option menu...
static const char * structure_indifferent[2]
Amino acids that are indifferent for a certain structure (ALPHA_HELIX or BETA_SHEET) as used in ED4_p...
static void ED4_pfold_find_turns(const unsigned char *sequence, char *structure, int length)
Predicts beta-turns from the given amino acid sequence.
static HelixNrInfo * start
#define PFOLD_AWAR_MATCH_METHOD
Selected method for computing the match quality (see PFOLD_MATCH_METHOD).
GBDATA * GBT_find_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:177
static double cf_parameters_norm[20][7]
Normalized former values for alpha-helices, beta-sheets (= strands) and beta-turns as well as beta-tu...
GB_ERROR GB_export_error(const char *error)
Definition: arb_msg.cxx:259
WindowCallback makeHelpCallback(const char *helpfile)
Definition: aw_window.hxx:106
int ED4_pfold_round_sym(double d)
Symmetric arithmetic rounding of a double value to an integer value.
Unknown structure.
#define cf_breaker(aa, strct)
Returns the breaker value of an amino acid depending on the given structure type. ...
GB_ERROR ED4_pfold_set_SAI(char **protstruct, GBDATA *gb_main, const char *alignment_name, long *protstruct_len)
Sets the reference protein secondary structure SAI.
#define PFOLD_AWAR_SYMBOL_TEMPLATE
Symbols for the match quality as used for match methods SECSTRUCT_SECSTRUCT and SECSTRUCT_SEQUENCE_PR...
GBDATA * get_gb_main() const
Definition: ed4_class.hxx:1426
char * alignment_name
Definition: ed4_class.hxx:1445
static double min_former_value[3]
Minimum former value for alpha-helix, beta-sheet (in cf_parameters) and beta-turn (in cf_parameters_n...
#define e4_assert(bed)
static const char * structure_breaker[2]
Amino acids that break a certain structure (ALPHA_HELIX or BETA_SHEET) as used in ED4_pfold_extend_nu...
static void error(const char *msg)
Definition: mkptypes.cxx:96
PFOLD_MATCH_METHOD
Defines the methods for match computation. For details refer to ED4_pfold_calculate_secstruct_match()...
#define PFOLD_AWAR_PAIR_TEMPLATE
Structure pairs that define the match quality (see pfold_pairs) as used for match methods SECSTRUCT_S...
char * read_string() const
Definition: AW_awar.cxx:201
const char * name
Name or description.
AW_awar * awar(const char *awar)
Definition: AW_root.cxx:554
Defines a name-value pair (e.g. for awars, menu entries, etc.).
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:670
void clear_option_menu(AW_option_menu_struct *oms)
Compare a full prediction of the protein secondary structure from its amino acid sequence with a refe...
void insert_default_option(AW_label choice_label, const char *mnemonic, const char *var_value, const char *name_of_color=NULp)
static struct pfold_mem_handler pfold_dealloc
GBDATA * GBT_next_SAI(GBDATA *gb_sai)
Definition: aditem.cxx:166
static char structure_chars[3]
Characters representing protein secondary structure.
long protstruct_len
Definition: ed4_class.hxx:1456
#define PFOLD_AWAR_ENABLE
Enable structure match.
PFOLD_STRUCTURE
Protein secondary structure types.
static double max_former_value[3]
Maximum former value for alpha-helix, beta-sheet (in cf_parameters) and beta-turn (in cf_parameters_n...
static void ED4_pfold_find_nucleation_sites(const unsigned char *sequence, char *structure, int length, const PFOLD_STRUCTURE s)
Finds nucleation sites that initiate the specified structure.
static void ED4_pfold_init_statics()
Initializes static variables.
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:903
static const char * amino_acids
Specifies the characters used for amino acid one letter code.
AW_root * aw_root
Definition: ed4_class.hxx:1429
void aw_message(const char *msg)
Definition: AW_status.cxx:932
static void setup_pfold_config(AWT_config_definition &cdef)
static void ED4_pfold_resolve_overlaps(const unsigned char *sequence, char *structures[4], int length)
Resolves overlaps of predicted secondary structures and creates structure summary.
#define NULp
Definition: cxxforward.h:97
#define ED4_AWAR_GAP_CHARS
Definition: ed4_awars.hxx:35
static void ED4_pfold_extend_nucleation_sites(const unsigned char *sequence, char *structure, int length, const PFOLD_STRUCTURE s)
Extends the found nucleation sites in both directions.
static GB_ERROR ED4_pfold_predict_structure(const unsigned char *sequence, char *structures[4], int length)
Predicts protein secondary structures from the amino acid sequence.
GB_transaction ta(gb_var)
void callback(const WindowCallback &cb)
Definition: AW_window.cxx:130
GBDATA * gb_main
Definition: adname.cxx:33
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:441
size_t length
GB_ERROR ED4_pfold_calculate_secstruct_match(const unsigned char *structure_sai, const unsigned char *structure_cmp, const int start, const int end, char *result_buffer, PFOLD_MATCH_METHOD match_method)
Compares a protein secondary structure with a primary structure or another secondary structure...
#define min(a, b)
Definition: f2c.h:153
#define AW_ROOT_DEFAULT
Definition: aw_base.hxx:106
GB_ERROR write_int(long aw_int)
#define PFOLD_AWAR_SELECTED_SAI
Selected reference protein secondary structure SAI (i.e. the SAI that is used for structure compariso...
Adds support for protein structure prediction, comparison of two protein secondary structures and of ...
GB_write_int const char s
Definition: AW_awar.cxx:156