ARB
DI_protdist.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : DI_protdist.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "di_protdist.hxx"
12 #include "di_matr.hxx"
13 #include <aw_msg.hxx>
14 #include <arb_progress.h>
15 #include <cmath>
16 
17 #define epsilon 0.000001 // a small number
18 
19 double di_protdist::pameigs[20] = {
20  -0.022091252, -0.019297602, 0.000004760, -0.017477817,
21  -0.016575549, -0.015504543, -0.002112213, -0.002685727,
22  -0.002976402, -0.013440755, -0.012926992, -0.004293227,
23  -0.005356688, -0.011064786, -0.010480731, -0.008760449,
24  -0.007142318, -0.007381851, -0.007806557, -0.008127024
25 };
26 
27 double di_protdist::pamprobs[20][20] = {
28  {
29  -0.01522976, -0.00746819, -0.13934468, 0.11755315, -0.00212101,
30  0.01558456, -0.07408235, -0.00322387, 0.01375826, 0.00448826,
31  0.00154174, 0.02013313, -0.00159183, -0.00069275, -0.00399898,
32  0.08414055, -0.01188178, -0.00029870, 0.00220371, 0.00042546
33  },
34  {
35  -0.07765582, -0.00712634, -0.03683209, -0.08065755, -0.00462872,
36  -0.03791039, 0.10642147, -0.00912185, 0.01436308, -0.00133243,
37  0.00166346, 0.00624657, -0.00003363, -0.00128729, -0.00690319,
38  0.17442028, -0.05373336, -0.00078751, -0.00038151, 0.01382718
39  },
40  {
41  -0.08810973, -0.04081786, -0.04066004, -0.04736004, -0.03275406,
42  -0.03761164, -0.05047487, -0.09086213, -0.03269598, -0.03558015,
43  -0.08407966, -0.07970977, -0.01504743, -0.04011920, -0.05182232,
44  -0.07026991, -0.05846931, -0.01016998, -0.03047472, -0.06280511
45  },
46  {
47  0.02513756, -0.00578333, 0.09865453, 0.01322314, -0.00310665,
48  0.05880899, -0.09252443, -0.02986539, -0.03127460, 0.01007539,
49  -0.00360119, -0.01995024, 0.00094940, -0.00145868, -0.01388816,
50  0.11358341, -0.12127513, -0.00054696, -0.00055627, 0.00417284
51  },
52  {
53  0.16517316, -0.00254742, -0.03318745, -0.01984173, 0.00031890,
54  -0.02817810, 0.02661678, -0.01761215, 0.01665112, 0.10513343,
55  -0.00545026, 0.01827470, -0.00207616, -0.00763758, -0.01322808,
56  -0.02202576, -0.07434204, 0.00020593, 0.00119979, -0.10827873
57  },
58  {
59  0.16088826, 0.00056313, -0.02579303, -0.00319655, 0.00037228,
60  -0.03193150, 0.01655305, -0.03028640, 0.01367746, -0.11248153,
61  0.00778371, 0.02675579, 0.00243718, 0.00895470, -0.01729803,
62  -0.02686964, -0.08262584, 0.00011794, -0.00225134, 0.09415650
63  },
64  {
65  -0.01739295, 0.00572017, -0.00712592, -0.01100922, -0.00870113,
66  -0.00663461, -0.01153857, -0.02248432, -0.00382264, -0.00358612,
67  -0.00139345, -0.00971460, -0.00133312, 0.01927783, -0.01053838,
68  -0.00911362, -0.01010908, 0.09417598, 0.01763850, -0.00955454
69  },
70  {
71  0.01728888, 0.01344211, 0.01200836, 0.01857259, -0.17088517,
72  0.01457592, 0.01997839, 0.02844884, 0.00839403, 0.00196862,
73  0.01391984, 0.03270465, 0.00347173, -0.01940984, 0.01233979,
74  0.00542887, 0.01008836, 0.00126491, -0.02863042, 0.00449764
75  },
76  {
77  -0.02881366, -0.02184155, -0.01566086, -0.02593764, -0.04050907,
78  -0.01539603, -0.02576729, -0.05089606, -0.00597430, 0.02181643,
79  0.09835597, -0.04040940, 0.00873512, 0.12139434, -0.02427882,
80  -0.02945238, -0.01566867, -0.01606503, 0.09475319, 0.02238670
81  },
82  {
83  0.04080274, -0.02869626, -0.05191093, -0.08435843, 0.00021141,
84  0.13043842, 0.00871530, 0.00496058, -0.02797641, -0.00636933,
85  0.02243277, 0.03640362, -0.05735517, 0.00196918, -0.02218934,
86  -0.00608972, 0.02872922, 0.00047619, 0.00151285, 0.00883489
87  },
88  {
89  -0.02623824, 0.00331152, 0.03640692, 0.04260231, -0.00038223,
90  -0.07480340, -0.01022492, -0.00426473, 0.01448116, 0.01456847,
91  0.05786680, 0.03368691, -0.10126924, -0.00147454, 0.01275395,
92  0.00017574, -0.01585206, -0.00015767, -0.00231848, 0.02310137
93  },
94  {
95  -0.00846258, -0.01508106, -0.01967505, -0.02772004, 0.01248253,
96  -0.01331243, -0.02569382, -0.04461524, -0.02207075, 0.04663443,
97  0.19347923, -0.02745691, 0.02288515, -0.04883849, -0.01084597,
98  -0.01947187, -0.00081675, 0.00516540, -0.07815919, 0.08035585
99  },
100  {
101  -0.06553111, 0.09756831, 0.00524326, -0.00885098, 0.00756653,
102  0.02783099, -0.00427042, -0.16680359, 0.03951331, -0.00490540,
103  0.01719610, 0.15018204, 0.00882722, -0.00423197, -0.01919217,
104  -0.02963619, -0.01831342, -0.00524338, 0.00011379, -0.02566864
105  },
106  {
107  -0.07494341, -0.11348850, 0.00241343, -0.00803016, 0.00492438,
108  0.00711909, -0.00829147, 0.05793337, 0.02734209, 0.02059759,
109  -0.02770280, 0.14128338, 0.01532479, 0.00364307, 0.05968116,
110  -0.06497960, -0.08113941, 0.00319445, -0.00104222, 0.03553497
111  },
112  {
113  0.05948223, -0.08959930, 0.03269977, -0.03272374, -0.00365667,
114  -0.03423294, -0.06418925, -0.05902138, 0.05746317, -0.02580596,
115  0.01259572, 0.05848832, 0.00672666, 0.00233355, -0.05145149,
116  0.07348503, 0.11427955, 0.00142592, -0.01030651, -0.04862799
117  },
118  {
119  -0.01606880, 0.05200845, -0.01212967, -0.06824429, -0.00234304,
120  0.01094203, -0.07375538, 0.08808629, 0.12394822, 0.02231351,
121  -0.03608265, -0.06978045, -0.00618360, 0.00274747, -0.01921876,
122  -0.01541969, -0.02223856, -0.00107603, -0.01251777, 0.05412534
123  },
124  {
125  0.01688843, 0.05784728, -0.02256966, -0.07072251, -0.00422551,
126  -0.06261233, -0.08502830, 0.08925346, -0.08529597, 0.01519343,
127  -0.05008258, 0.10931873, 0.00521033, 0.02593305, -0.00717855,
128  0.02291527, 0.02527388, -0.00266188, -0.00871160, 0.02708135
129  },
130  {
131  -0.04233344, 0.00076379, 0.01571257, 0.04003092, 0.00901468,
132  0.00670577, 0.03459487, 0.12420216, -0.00067366, -0.01515094,
133  0.05306642, 0.04338407, 0.00511287, 0.01036639, -0.17867462,
134  -0.02289440, -0.03213205, 0.00017924, -0.01187362, -0.03933874
135  },
136  {
137  0.01284817, -0.01685622, 0.00724363, 0.01687952, -0.00882070,
138  -0.00555957, 0.01676246, -0.05560456, -0.00966893, 0.06197684,
139  -0.09058758, 0.00880607, 0.00108629, -0.08308956, -0.08056832,
140  -0.00413297, 0.02973107, 0.00092948, 0.07010111, 0.13007418
141  },
142  {
143  0.00700223, -0.01347574, 0.00691332, 0.03122905, 0.00310308,
144  0.00946862, 0.03455040, -0.06712536, -0.00304506, 0.04267941,
145  -0.10422292, -0.01127831, -0.00549798, 0.11680505, -0.03352701,
146  -0.00084536, 0.01631369, 0.00095063, -0.09570217, 0.06480321
147  }
148 };
149 
150 void di_protdist::maketrans() {
151  // Make up transition probability matrix from code and category tables
152  long i, j, k, m, n, s;
153  double x, sum = 0;
154  long sub[3], newsub[3];
155  double f[4], g[4];
156  aas b1, b2;
157  double TEMP, TEMP1, TEMP2, TEMP3;
158 
159  for (i = 0; i <= 19; i++) {
160  pi[i] = 0.0;
161  for (j = 0; j <= 19; j++)
162  prob[i][j] = 0.0;
163  }
164  f[0] = freqt;
165  f[1] = freqc;
166  f[2] = freqa;
167  f[3] = freqg;
168  g[0] = freqc + freqt;
169  g[1] = freqc + freqt;
170  g[2] = freqa + freqg;
171  g[3] = freqa + freqg;
172  TEMP = f[0];
173  TEMP1 = f[1];
174  TEMP2 = f[2];
175  TEMP3 = f[3];
176  fracchange = xi * (2 * f[0] * f[1] / g[0] + 2 * f[2] * f[3] / g[2]) +
177  xv * (1 - TEMP * TEMP - TEMP1 * TEMP1 - TEMP2 * TEMP2 - TEMP3 * TEMP3);
178  for (i = 0; i <= 3; i++) {
179  for (j = 0; j <= 3; j++) {
180  for (k = 0; k <= 3; k++) {
181  if (trans[i][j][k] != STOP)
182  sum += f[i] * f[j] * f[k];
183  }
184  }
185  }
186  for (i = 0; i <= 3; i++) {
187  sub[0] = i + 1;
188  for (j = 0; j <= 3; j++) {
189  sub[1] = j + 1;
190  for (k = 0; k <= 3; k++) {
191  sub[2] = k + 1;
192  b1 = trans[i][j][k];
193  for (m = 0; m <= 2; m++) {
194  s = sub[m];
195  for (n = 1; n <= 4; n++) {
196  memcpy(newsub, sub, sizeof(long) * 3L);
197  newsub[m] = n;
198  x = f[i] * f[j] * f[k] / (3.0 * sum);
199  if (((s == 1 || s == 2) && (n == 3 || n == 4)) ||
200  ((n == 1 || n == 2) && (s == 3 || s == 4)))
201  x *= xv * f[n - 1];
202  else
203  x *= xi * f[n - 1] / g[n - 1] + xv * f[n - 1];
204  b2 = trans[newsub[0] - 1][newsub[1] - 1][newsub[2] - 1];
205  if (b1 != STOP) {
206  pi[b1] += x;
207  if (b2 != STOP) {
208  if (cat[b1] != cat[b2]) {
209  prob[b1][b2] += x * ease;
210  prob[b1][b1] += x * (1.0 - ease);
211  } else
212  prob[b1][b2] += x;
213  } else
214  prob[b1][b1] += x;
215  }
216  }
217  }
218  }
219  }
220  }
221  for (i = 0; i <= 19; i++) // LOOP_VECTORIZED[!<910]
222  prob[i][i] -= pi[i];
223  for (i = 0; i <= 19; i++) {
224  for (j = 0; j <= 19; j++)
225  prob[i][j] /= sqrt(pi[i] * pi[j]);
226  }
227  // computes pi^(1/2)*B*pi^(-1/2)
228 }
229 
230 void di_protdist::code() {
231  // make up table of the code 0 = u, 1 = c, 2 = a, 3 = g
232 
233  trans[0][0][0] = PHE;
234  trans[0][0][1] = PHE;
235  trans[0][0][2] = LEU;
236  trans[0][0][3] = LEU;
237  trans[0][1][0] = SER;
238  trans[0][1][1] = SER;
239  trans[0][1][2] = SER;
240  trans[0][1][3] = SER;
241  trans[0][2][0] = TYR;
242  trans[0][2][1] = TYR;
243  trans[0][2][2] = STOP;
244  trans[0][2][3] = STOP;
245  trans[0][3][0] = CYS;
246  trans[0][3][1] = CYS;
247  trans[0][3][2] = STOP;
248  trans[0][3][3] = TRP;
249  trans[1][0][0] = LEU;
250  trans[1][0][1] = LEU;
251  trans[1][0][2] = LEU;
252  trans[1][0][3] = LEU;
253  trans[1][1][0] = PRO;
254  trans[1][1][1] = PRO;
255  trans[1][1][2] = PRO;
256  trans[1][1][3] = PRO;
257  trans[1][2][0] = HIS;
258  trans[1][2][1] = HIS;
259  trans[1][2][2] = GLN;
260  trans[1][2][3] = GLN;
261  trans[1][3][0] = ARG;
262  trans[1][3][1] = ARG;
263  trans[1][3][2] = ARG;
264  trans[1][3][3] = ARG;
265  trans[2][0][0] = ILEU;
266  trans[2][0][1] = ILEU;
267  trans[2][0][2] = ILEU;
268  trans[2][0][3] = MET;
269  trans[2][1][0] = THR;
270  trans[2][1][1] = THR;
271  trans[2][1][2] = THR;
272  trans[2][1][3] = THR;
273  trans[2][2][0] = ASN;
274  trans[2][2][1] = ASN;
275  trans[2][2][2] = LYS;
276  trans[2][2][3] = LYS;
277  trans[2][3][0] = SER;
278  trans[2][3][1] = SER;
279  trans[2][3][2] = ARG;
280  trans[2][3][3] = ARG;
281  trans[3][0][0] = VAL;
282  trans[3][0][1] = VAL;
283  trans[3][0][2] = VAL;
284  trans[3][0][3] = VAL;
285  trans[3][1][0] = ALA;
286  trans[3][1][1] = ALA;
287  trans[3][1][2] = ALA;
288  trans[3][1][3] = ALA;
289  trans[3][2][0] = ASP;
290  trans[3][2][1] = ASP;
291  trans[3][2][2] = GLU;
292  trans[3][2][3] = GLU;
293  trans[3][3][0] = GLY;
294  trans[3][3][1] = GLY;
295  trans[3][3][2] = GLY;
296  trans[3][3][3] = GLY;
297 
298  switch (whichcode) {
299  case UNIVERSAL:
300  case CILIATE:
301  break; // use default code above
302 
303  case MITO:
304  trans[0][3][2] = TRP;
305  break;
306 
307  case VERTMITO:
308  trans[0][3][2] = TRP;
309  trans[2][3][2] = STOP;
310  trans[2][3][3] = STOP;
311  trans[2][0][2] = MET;
312  break;
313 
314  case FLYMITO:
315  trans[0][3][2] = TRP;
316  trans[2][0][2] = MET;
317  trans[2][3][2] = SER;
318  break;
319 
320  case YEASTMITO:
321  trans[0][3][2] = TRP;
322  trans[1][0][2] = THR;
323  trans[2][0][2] = MET;
324  break;
325  }
326 }
327 
328 void di_protdist::transition() {
329  // calculations related to transition-transversion ratio
330 
331  double freqr = freqa + freqg;
332  double freqy = freqc + freqt;
333  double freqgr = freqg / freqr;
334  double freqty = freqt / freqy;
335 
336  double aa = ttratio * freqr * freqy - freqa * freqg - freqc * freqt;
337  double bb = freqa * freqgr + freqc * freqty;
338 
339  xi = aa / (aa + bb);
340  xv = 1.0 - xi;
341 
342  if (xi <= 0.0 && xi >= -epsilon) {
343  xi = 0.0;
344  }
345  if (xi < 0.0) {
346  GBK_terminate("This transition-transversion ratio is impossible with these base frequencies"); // @@@ should be handled better
347  }
348 }
349 
350 void di_protdist::givens(di_aa_matrix a, long i, long j, long n, double ctheta, double stheta, bool left) {
351  // Givens transform at i,j for 1..n with angle theta
352 
353  if (left) {
354  for (long k = 0; k < n; k++) { // LOOP_VECTORIZED
355  double d = ctheta * a[i - 1][k] + stheta * a[j - 1][k];
356  a[j - 1][k] = ctheta * a[j - 1][k] - stheta * a[i - 1][k];
357  a[i - 1][k] = d;
358  }
359  }
360  else {
361  for (long k = 0; k < n; k++) { // LOOP_VECTORIZED[!<8.1]
362  double d = ctheta * a[k][i - 1] + stheta * a[k][j - 1];
363  a[k][j - 1] = ctheta * a[k][j - 1] - stheta * a[k][i - 1];
364  a[k][i - 1] = d;
365  }
366  }
367 }
368 
369 void di_protdist::coeffs(double x, double y, double *c, double *s, double accuracy) {
370  // compute cosine and sine of theta
371  double root = hypot(x, y);
372  if (root < accuracy) {
373  *c = 1.0;
374  *s = 0.0;
375  }
376  else {
377  *c = x / root;
378  *s = y / root;
379  }
380 }
381 
382 void di_protdist::tridiag(di_aa_matrix a, long n, double accuracy) {
383  // Givens tridiagonalization
384  long i, j;
385  double s, c;
386 
387  for (i = 2; i < n; i++) {
388  for (j = i + 1; j <= n; j++) {
389  coeffs(a[i - 2][i - 1], a[i - 2][j - 1], &c, &s, accuracy);
390  givens(a, i, j, n, c, s, true);
391  givens(a, i, j, n, c, s, false);
392  givens(eigvecs, i, j, n, c, s, true);
393  }
394  }
395 }
396 
397 void di_protdist::shiftqr(di_aa_matrix a, long n, double accuracy) {
398  // QR eigenvalue-finder
399  for (long i = n; i >= 2; i--) {
400  do {
401  const double& ai1 = a[i - 1][i - 1];
402  const double& ai2 = a[i - 2][i - 2];
403  const double d = hypot(ai2 - ai1, a[i - 1][i - 2]);
404  double approx = ai2 + ai1;
405 
406  if (ai1 < ai2) {
407  approx = (approx - d) / 2.0;
408  }
409  else {
410  approx = (approx + d) / 2.0;
411  }
412 
413  for (long j = 0; j < i; j++) {
414  a[j][j] -= approx;
415  }
416 
417  for (long j = 1; j < i; j++) {
418  double s;
419  double c;
420  coeffs(a[j - 1][j - 1], a[j][j - 1], &c, &s, accuracy);
421  givens(a, j, j + 1, i, c, s, true);
422  givens(a, j, j + 1, i, c, s, false);
423  givens(eigvecs, j, j + 1, n, c, s, true);
424  }
425 
426  for (long j = 0; j < i; j++) {
427  a[j][j] += approx;
428  }
429  }
430  while (fabs(a[i - 1][i - 2]) > accuracy);
431  }
432 }
433 
434 
435 void di_protdist::qreigen(di_aa_matrix proba, long n) {
436  // QR eigenvector/eigenvalue method for symmetric matrix
437  double accuracy;
438  long i, j;
439 
440  accuracy = 1.0e-6;
441  for (i = 0; i < n; i++) {
442  for (j = 0; j < n; j++)
443  eigvecs[i][j] = 0.0;
444  eigvecs[i][i] = 1.0;
445  }
446  tridiag(proba, n, accuracy);
447  shiftqr(proba, n, accuracy);
448  for (i = 0; i < n; i++)
449  eig[i] = proba[i][i];
450  for (i = 0; i <= 19; i++) {
451  for (j = 0; j <= 19; j++)
452  proba[i][j] = sqrt(pi[j]) * eigvecs[i][j];
453  }
454  // proba[i][j] is the value of U' times pi^(1/2)
455 }
456 
457 
458 void di_protdist::pameigen() {
459  // eigenanalysis for PAM matrix, precomputed
460  memcpy(prob, pamprobs, sizeof(pamprobs));
461  memcpy(eig, pameigs, sizeof(pameigs));
462  fracchange = 0.01;
463 }
464 
465 void di_protdist::build_exptteig(double tt) {
466  int m;
467  for (m = 0; m <= 19; m++) {
468  exptteig[m] = exp(tt * eig[m]);
469  }
470 }
471 
472 void di_protdist::predict(double /* tt */, long nb1, long nb2) {
473  // make contribution to prediction of this aa pair
474  for (long m = 0; m <= 19; m++) { // LOOP_VECTORIZED[!<8.1]
475  double q = prob[m][nb1] * prob[m][nb2] * exptteig[m];
476  p += q;
477  double TEMP = eig[m];
478  dp += TEMP * q;
479  d2p += TEMP * TEMP * q;
480  }
481 }
482 
483 void di_protdist::build_predikt_table(int pos) {
484  double tt = pos_2_tt(pos);
485  build_exptteig(tt);
486 
487  akt_slopes = slopes[pos] = ARB_calloc<di_paa_matrix>(1);
488  akt_curves = curves[pos] = ARB_calloc<di_paa_matrix>(1);
489  akt_infs = infs[pos] = ARB_calloc<di_bool_matrix>(1);
490 
491  for (int b1 = ALA; b1 < DI_MAX_PAA; b1++) {
492  for (int b2 = ALA; b2 <= b1; b2++) {
493  if (b1 != STOP && b1 != DEL && b1 != QUEST && b1 != UNK &&
494  b2 != STOP && b2 != DEL && b2 != QUEST && b2 != UNK)
495  {
496  p = 0.0;
497  dp = 0.0;
498  d2p = 0.0;
499 
500  if (b1 != ASX && b1 != GLX && b2 != ASX && b2 != GLX) {
501  predict(tt, b1, b2);
502  }
503  else {
504  if (b1 == ASX) {
505  if (b2 == ASX) {
506  predict(tt, 2L, 2L);
507  predict(tt, 2L, 3L);
508  predict(tt, 3L, 2L);
509  predict(tt, 3L, 3L);
510  }
511  else {
512  if (b2 == GLX) {
513  predict(tt, 2L, 5L);
514  predict(tt, 2L, 6L);
515  predict(tt, 3L, 5L);
516  predict(tt, 3L, 6L);
517  }
518  else {
519  predict(tt, 2L, b2);
520  predict(tt, 3L, b2);
521  }
522  }
523  }
524  else {
525  if (b1 == GLX) {
526  if (b2 == ASX) {
527  predict(tt, 5L, 2L);
528  predict(tt, 5L, 3L);
529  predict(tt, 6L, 2L);
530  predict(tt, 6L, 3L);
531  }
532  else {
533  if (b2 == GLX) {
534  predict(tt, 5L, 5L);
535  predict(tt, 5L, 6L);
536  predict(tt, 6L, 5L);
537  predict(tt, 6L, 6L);
538  }
539  else {
540  predict(tt, 5L, b2);
541  predict(tt, 6L, b2);
542  }
543  }
544  }
545  else {
546  if (b2 == ASX) {
547  predict(tt, b1, 2L);
548  predict(tt, b1, 3L);
549  predict(tt, b1, 2L);
550  predict(tt, b1, 3L);
551  }
552  else if (b2 == GLX) {
553  predict(tt, b1, 5L);
554  predict(tt, b1, 6L);
555  predict(tt, b1, 5L);
556  predict(tt, b1, 6L);
557  }
558  }
559  }
560  }
561  if (p > 0.0) {
562  akt_slopes[0][b1][b2] = dp / p;
563  akt_curves[0][b1][b2] = d2p / p - dp * dp / (p * p);
564  akt_infs[0][b1][b2] = 0;
565  akt_slopes[0][b2][b1] = akt_slopes[0][b1][b2];
566  akt_curves[0][b2][b1] = akt_curves[0][b1][b2];
567  akt_infs[0][b2][b1] = 0;
568  }
569  else {
570  akt_infs[0][b1][b2] = 1;
571  akt_infs[0][b2][b1] = 1;
572  }
573  }
574  }
575  }
576 }
577 
578 int di_protdist::tt_2_pos(double tt) {
579  int pos = (int)(tt * fracchange * DI_RESOLUTION);
580  if (pos >= DI_RESOLUTION * DI_MAX_DIST)
581  pos = DI_RESOLUTION * DI_MAX_DIST - 1;
582  if (pos < 0)
583  pos = 0;
584  return pos;
585 }
586 
587 double di_protdist::pos_2_tt(int pos) {
588  double tt = pos / (fracchange * DI_RESOLUTION);
589  return tt+epsilon;
590 }
591 
592 void di_protdist::build_akt_predikt(double tt) {
593  // take an actual slope from the hash table, else calculate a new one
594  int pos = tt_2_pos(tt);
595  if (!slopes[pos]) {
596  build_predikt_table(pos);
597  }
598  akt_slopes = slopes[pos];
599  akt_curves = curves[pos];
600  akt_infs = infs[pos];
601  return;
602 
603 }
604 
605 GB_ERROR di_protdist::makedists(bool *aborted_flag) {
606  /* compute the distances.
607  * sets 'aborted_flag' to true, if it is non-NULp and user aborts the calculation
608  */
609  long i, j, k, m, n, iterations;
610  double delta, slope, curv;
611  int b1 = 0, b2=0;
612  double tt = 0;
613  int pos;
614 
615  arb_progress progress("Calculating distances", matrix_halfsize(spp, false));
616  GB_ERROR error = NULp;
617 
618  for (i = 0; i < spp && !error; i++) {
619  matrix->set(i, i, 0.0);
620  {
621  // move all unknown characters to del
622  ap_pro *seq1 = entries[i]->get_prot_seq()->get_sequence();
623  for (k = 0; k <chars; k++) {
624  b1 = seq1[k];
625  if (b1 <= VAL) continue;
626  if (b1 == ASX || b1 == GLX) continue;
627  seq1[k] = DEL;
628  }
629  }
630 
631  for (j = 0; j < i && !error; j++) {
632  if (whichcat > KIMURA) {
633  if (whichcat == PAM)
634  tt = 10.0;
635  else
636  tt = 1.0;
637  delta = tt / 2.0;
638  iterations = 0;
639  do {
640  slope = 0.0;
641  curv = 0.0;
642  pos = tt_2_pos(tt);
643  tt = pos_2_tt(pos);
644  build_akt_predikt(tt);
645  const ap_pro *seq1 = entries[i]->get_prot_seq()->get_sequence();
646  const ap_pro *seq2 = entries[j]->get_prot_seq()->get_sequence();
647  for (k = chars; k >0; k--) {
648  b1 = *(seq1++);
649  b2 = *(seq2++);
650  if (predict_infinity(b1, b2)) {
651  break;
652  }
653  slope += predict_slope(b1, b2);
654  curv += predict_curve(b1, b2);
655  }
656  iterations++;
657  if (!predict_infinity(b1, b2)) {
658  if (curv < 0.0) {
659  tt -= slope / curv;
660  if (tt > 10000.0) {
661  aw_message(GBS_global_string("Warning: infinite distance between species '%s' and '%s'\n", entries[i]->name, entries[j]->name));
662  tt = -1.0 / fracchange;
663  break;
664  }
665  int npos = tt_2_pos(tt);
666  int d = npos - pos; if (d<0) d=-d;
667  if (d<=1) { // cannot optimize
668  break;
669  }
670 
671  }
672  else {
673  if ((slope > 0.0 && delta < 0.0) || (slope < 0.0 && delta > 0.0))
674  delta /= -2;
675  if (tt + delta < 0 && tt <= epsilon) {
676  break;
677  }
678  tt += delta;
679  }
680  }
681  else {
682  delta /= -2;
683  tt += delta;
684  if (tt < 0) tt = 0;
685  }
686  } while (iterations < 20);
687  }
688  else { // cat < kimura
689  m = 0;
690  n = 0;
691  const ap_pro *seq1 = entries[i]->get_prot_seq()->get_sequence();
692  const ap_pro *seq2 = entries[j]->get_prot_seq()->get_sequence();
693  for (k = chars; k >0; k--) {
694  b1 = *(seq1++);
695  b2 = *(seq2++);
696  if (b1 <= VAL && b2 <= VAL) {
697  if (b1 == b2) m++;
698  n++;
699  }
700  }
701  if (n < 5) { // no info
702  tt = -1.0;
703  }
704  else {
705  switch (whichcat) {
706  case KIMURA: {
707  double rel = 1 - (double) m / n;
708  double drel = 1.0 - rel - 0.2 * rel * rel;
709  if (drel < 0.0) {
710  aw_message(GBS_global_string("Warning: distance between sequences '%s' and '%s' is too large for kimura formula", entries[i]->name, entries[j]->name));
711  tt = -1.0;
712  }
713  else {
714  tt = -log(drel);
715  }
716  break;
717  }
718  case NONE:
719  tt = (n-m)/(double)n;
720  break;
721  case SIMILARITY:
722  tt = m/(double)n;
723  break;
724  default:
725  di_assert(0);
726  break;
727  }
728  }
729  }
730  matrix->set(i, j, fracchange * tt);
731  progress.inc_and_check_user_abort(error);
732  }
733  }
734  if (aborted_flag && error) *aborted_flag = true;
735  return error;
736 }
737 
738 
739 void di_protdist::clean_slopes() {
740  for (int i=0; i<DI_RESOLUTION*DI_MAX_DIST; i++) {
741  freenull(slopes[i]);
742  freenull(curves[i]);
743  freenull(infs[i]);
744  }
745  akt_slopes = NULp;
746  akt_curves = NULp;
747  akt_infs = NULp;
748 }
749 
751  clean_slopes();
752 }
753 
754 di_protdist::di_protdist(di_codetype code_, di_cattype cat_, long nentries, DI_ENTRY **entries_, long seq_len, AP_smatrix *matrix_)
755  : whichcode(code_),
756  whichcat(cat_),
757  spp(nentries),
758  chars(seq_len),
759  freqa(.25),
760  freqc(.25),
761  freqg(.25),
762  freqt(.25),
763  ttratio(2.0),
764  ease(0.457),
765  fracchange(0.0),
766  entries(entries_),
767  akt_slopes(NULp),
768  akt_curves(NULp),
769  akt_infs(NULp),
770  matrix(matrix_),
771  p(0.0),
772  dp(0.0),
773  d2p(0.0)
774 {
775  memset(trans, 0, sizeof(trans));
776  memset(pi, 0, sizeof(pi));
777 
778  for (int i = 0; i<DI_MAX_AA; ++i) {
779  cat[i] = 0;
780  eig[i] = 0;
781  exptteig[i] = 0;
782 
783  for (int j = 0; j<DI_MAX_AA; ++j) {
784  prob[i][j] = 0;
785  eigvecs[i][j] = 0;
786  }
787  }
788 
789  for (int i = 0; i<(DI_RESOLUTION*DI_MAX_DIST); ++i) {
790  slopes[i] = NULp;
791  curves[i] = NULp;
792  infs[i] = NULp;
793  }
794 
795  transition(); // initializes members 'xi' and 'xv'
796 
797  switch (whichcat) {
798  case NONE:
799  case SIMILARITY:
800  case KIMURA:
801  fracchange = 1.0;
802  break;
803  case PAM:
804  code();
805  pameigen();
806  break;
807  default:
808  code();
809  maketrans();
810  qreigen(prob, 20L);
811  break;
812  }
813 }
static double freqa
#define NONE
Definition: GDE_def.h:34
static char * y[maxsp+1]
double di_aa_matrix[DI_MAX_AA][DI_MAX_AA]
Definition: di_protdist.hxx:28
const int DI_MAX_DIST
Definition: di_protdist.hxx:21
di_protdist(di_codetype codei, di_cattype cati, long nentries, DI_ENTRY **entries, long seq_len, AP_smatrix *matrixi)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
static double freqc
static double freqgr
static double freqty
const int DI_RESOLUTION
Definition: di_protdist.hxx:20
unsigned char ap_pro
static double fracchange
int chars
Definition: seq_search.cxx:38
void GBK_terminate(const char *error) __ATTR__NORETURN
Definition: arb_msg.cxx:463
void set(size_t i, size_t j, AP_FLOAT val)
Definition: AP_matrix.hxx:43
static void error(const char *msg)
Definition: mkptypes.cxx:96
di_codetype
Definition: di_protdist.hxx:25
Definition: di_matr.hxx:75
di_cattype
Definition: di_protdist.hxx:26
const int DI_MAX_AA
Definition: di_protdist.hxx:18
static double freqy
GB_ERROR makedists(bool *aborted_flag)
const int DI_MAX_PAA
Definition: di_protdist.hxx:19
static double freqr
AP_sequence_simple_protein * get_prot_seq()
Definition: di_matr.hxx:87
static double ttratio
CONSTEXPR_INLINE long matrix_halfsize(long entries, bool inclusive_diagonal)
Definition: AP_matrix.hxx:23
#define epsilon
Definition: DI_protdist.cxx:17
#define di_assert(cond)
Definition: DI_clusters.cxx:31
void aw_message(const char *msg)
Definition: AW_status.cxx:932
#define NULp
Definition: cxxforward.h:97
const ap_pro * get_sequence() const
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:274
GB_write_int const char s
Definition: AW_awar.cxx:156