ARB
NJ.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : NJ.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "NJ.hxx"
12 #include <neighbourjoin.hxx>
13 #include <TreeNode.h>
14 #include <arb_diff.h>
15 
17  memset((char *)this, 0, sizeof(PH_NEIGHBOUR_DIST));
18 }
19 
20 
21 void PH_NEIGHBOURJOINING::remove_taxa_from_dist_list(long i) { // O(n/2)
22  long a, j;
24  for (a=0; a<swap_size; a++) {
25  j = swap_tab[a];
26  if (i==j) continue;
27  if (j<i) {
28  nd = &(dist_matrix[i][j]);
29  }
30  else {
31  nd = &(dist_matrix[j][i]);
32  }
33  nd->remove();
34  net_divergence[j] -= nd->val; // corr net divergence
35  }
36 }
37 void PH_NEIGHBOURJOINING::add_taxa_to_dist_list(long i) { // O(n/2)
38  long a;
39  long pos, j;
41  AP_FLOAT my_nd = 0.0;
42  for (a=0; a<swap_size; a++) {
43  j = swap_tab[a];
44  if (i==j) continue;
45  if (j<i) {
46  nd = &(dist_matrix[i][j]);
47  }
48  else {
49  nd = &(dist_matrix[j][i]);
50  }
51  ph_assert(!nd->previous);
52  pos = (int)(nd->val*dist_list_corr);
53  if (pos >= dist_list_size) {
54  pos = dist_list_size-1;
55  } else if (pos<0)
56  pos = 0;
57  nd->add(&(dist_list[pos]));
58 
59  net_divergence[j] += nd->val; // corr net divergence
60  my_nd += nd->val;
61  }
62  net_divergence[i] = my_nd;
63 }
64 
65 AP_FLOAT PH_NEIGHBOURJOINING::get_max_net_divergence() { // O(n/2)
66  long a, i;
67  AP_FLOAT max = 0.0;
68  for (a=0; a<swap_size; a++) {
69  i = swap_tab[a];
70  if (net_divergence[i] > max) max = net_divergence[i];
71  }
72  return max;
73 }
74 
75 void PH_NEIGHBOURJOINING::remove_taxa_from_swap_tab(long i) { // O(n/2)
76  long a;
77  long *source, *dest;
78  source = dest = swap_tab;
79  for (a=0; a<swap_size; a++) {
80  if (swap_tab[a] == i) {
81  source++;
82  }
83  else {
84  *(dest++) = *(source++);
85  }
86  }
87  swap_size --;
88 }
89 
91  size = smatrix.size();
92 
93  // init swap tab
94  swap_size = size;
95  swap_tab = new long[size];
96  for (long i=0; i<swap_size; i++) swap_tab[i] = i; // LOOP_VECTORIZED=2[!<5] // tested down to gcc 5.5.0 (fails on 4.9.2)
97 
98  ARB_calloc(net_divergence, size);
99 
100  dist_list_size = size; // hope to be the best
101  dist_list = new PH_NEIGHBOUR_DIST[dist_list_size]; // the roots, no elems
102  dist_list_corr = (dist_list_size-2.0)/smatrix.get_max_value();
103 
104  dist_matrix = new PH_NEIGHBOUR_DIST*[size];
105  for (long i=0; i<size; i++) {
106  dist_matrix[i] = new PH_NEIGHBOUR_DIST[i];
107  for (long j=0; j<i; j++) {
108  dist_matrix[i][j].val = smatrix.fast_get(i, j);
109  dist_matrix[i][j].i = i;
110  dist_matrix[i][j].j = j;
111  }
112  }
113  for (long i=0; i<size; i++) {
114  swap_size = i; // to calculate the correct net divergence..
115  add_taxa_to_dist_list(i); // ..add to dist list and add n.d.
116  }
117  swap_size = size;
118 }
119 
121  for (long i=0; i<size; i++) delete [] dist_matrix[i];
122  delete [] dist_matrix;
123  delete [] dist_list;
124  free(net_divergence);
125  delete [] swap_tab;
126 }
127 
128 AP_FLOAT PH_NEIGHBOURJOINING::get_min_ij(long& mini, long& minj) { // O(n*n/speedup)
129  // returns minval (only used by test inspection)
130 
131  AP_FLOAT maxri = get_max_net_divergence(); // O(n/2)
132  PH_NEIGHBOUR_DIST *dl;
133  long stat = 0;
134  AP_FLOAT x;
135  AP_FLOAT minval;
136  minval = 100000.0;
137  AP_FLOAT N_1 = 1.0/(swap_size-2.0);
138  maxri = maxri*2.0*N_1;
139  long pos;
140 
141  get_last_ij(mini, minj);
142 
143  for (pos=0; pos<dist_list_size; pos++) {
144  if (minval < pos/dist_list_corr - maxri) break;
145  // no way to get a better minimum
146  dl = dist_list[pos].next; // first entry does not contain information
147  for (; dl; dl=dl->next) {
148  x = (net_divergence[dl->i] + net_divergence[dl->j])*N_1;
149  if (dl->val-x<minval) {
150  minval = dl->val -x;
151  minj = dl->i;
152  mini = dl->j;
153  }
154  stat++;
155  }
156  }
157  return minval;
158 }
159 
160 void PH_NEIGHBOURJOINING::join_nodes(long i, long j, AP_FLOAT &leftl, AP_FLOAT& rightl) {
161  PH_NEIGHBOUR_DIST **d = dist_matrix;
162  AP_FLOAT dji;
163 
164  AP_FLOAT dist = get_dist(i, j);
165 
166  leftl = dist*.5 + (net_divergence[i] - net_divergence[j])*.5/
167  (swap_size - 2.0);
168  rightl = dist - leftl;
169 
170  remove_taxa_from_dist_list(j);
171  remove_taxa_from_swap_tab(j);
172  remove_taxa_from_dist_list(i);
173 
174  long a, k;
175  dji = d[j][i].val;
176  for (a=0; a<swap_size; a++) {
177  k = swap_tab[a];
178  if (k==i) continue; // k == j not possible
179  if (k>i) {
180  if (k>j) {
181  d[k][i].val = .5*(d[k][i].val + d[k][j].val - dji);
182  }
183  else {
184  d[k][i].val = .5*(d[k][i].val + d[j][k].val - dji);
185  }
186  }
187  else {
188  d[i][k].val = 0.5 * (d[i][k].val + d[j][k].val - dji);
189  }
190  }
191  add_taxa_to_dist_list(i);
192 }
193 
194 void PH_NEIGHBOURJOINING::get_last_ij(long& i, long& j) {
195  i = swap_tab[0];
196  j = swap_tab[1];
197 }
198 
200  return dist_matrix[j][i].val;
201 }
202 
203 TreeNode *neighbourjoining(const char *const *names, const AP_smatrix& smatrix, TreeRoot *troot) { // @@@ pass ConstStrArray
204  // structure_size >= sizeof(TreeNode);
205  // lower triangular matrix
206  // size: size of matrix
207 
208  PH_NEIGHBOURJOINING nj(smatrix);
209  TreeNode **nodes = ARB_calloc<TreeNode*>(smatrix.size());
210 
211  for (size_t i=0; i<smatrix.size(); i++) {
212  nodes[i] = troot->makeNode();
213  nodes[i]->name = strdup(names[i]);
214  nodes[i]->markAsLeaf();
215  }
216 
217  for (size_t i=0; i<smatrix.size()-2; i++) {
218  long a, b;
219  nj.get_min_ij(a, b);
220 
221  AP_FLOAT ll, rl;
222  nj.join_nodes(a, b, ll, rl);
223 
224  TreeNode *father = troot->makeNode();
225  father->leftson = nodes[a];
226  father->rightson = nodes[b];
227  father->leftlen = ll;
228  father->rightlen = rl;
229  nodes[a]->father = father;
230  nodes[b]->father = father;
231  nodes[a] = father;
232  }
233 
234  {
235  long a, b;
236  nj.get_last_ij(a, b);
237 
238  AP_FLOAT dist = nj.get_dist(a, b);
239 
240  AP_FLOAT ll = dist*0.5;
241  AP_FLOAT rl = dist*0.5;
242 
243  TreeNode *father = troot->makeNode();
244 
245  father->leftson = nodes[a];
246  father->rightson = nodes[b];
247 
248  father->leftlen = ll;
249  father->rightlen = rl;
250 
251  nodes[a]->father = father;
252  nodes[b]->father = father;
253 
254  free(nodes);
255 
256  father->get_tree_root()->change_root(NULp, father);
257  return father;
258  }
259 }
260 
261 // --------------------------------------------------------------------------------
262 
263 #ifdef UNIT_TESTS
264 #ifndef TEST_UNIT_H
265 #include <test_unit.h>
266 #endif
267 
268 static const AP_FLOAT EPSILON = 0.0001;
269 
270 static arb_test::match_expectation min_ij_equals(PH_NEIGHBOURJOINING& nj, long expected_i, long expected_j, AP_FLOAT expected_minval) {
271  using namespace arb_test;
272  expectation_group expected;
273 
274  long i, j;
275  AP_FLOAT minval = nj.get_min_ij(i, j);
276 
277  expected.add(that(i).is_equal_to(expected_i));
278  expected.add(that(j).is_equal_to(expected_j));
279  expected.add(that(minval).fulfills(epsilon_similar(EPSILON), expected_minval));
280 
281  return all().ofgroup(expected);
282 }
283 
284 #define TEST_EXPECT_MIN_IJ(nj,i,j,minval) TEST_EXPECTATION(min_ij_equals(nj,i,j,minval))
285 
286 void TEST_neighbourjoining() {
287  const size_t SIZE = 4;
288 
289 #define TEST_FORWARD_ORDER // @@@ changing the order of nodes here changes the resulting trees
290  // i do not understand, if that means there is sth wrong or not..
291 
292 #if defined(TEST_FORWARD_ORDER)
293  const char *names[SIZE] = { "A", "B", "C", "D"};
294  enum { A, B, C, D };
295 #else // !defined(TEST_FORWARD_ORDER)
296  const char *names[SIZE] = { "D", "C", "B", "A"};
297  enum { D, C, B, A };
298 #endif
299 
300  for (int test = 1; test <= 2; ++test) {
301  AP_smatrix sym_matrix(SIZE);
302 
303  // Note: values used here are distances
304  for (size_t i = 0; i < SIZE; ++i) sym_matrix.set(i, i, 0.0);
305 
306  sym_matrix.set(A, B, 0.0765); sym_matrix.set(A, C, 0.1619); sym_matrix.set(A, D, 0.2266);
307  sym_matrix.set(B, C, 0.1278); sym_matrix.set(B, D, 0.2061);
308 
309  switch (test) {
310  case 1: sym_matrix.set(C, D, 0.1646); break;
311  case 2: sym_matrix.set(C, D, 0.30); break;
312  default: arb_assert(0); break;
313  }
314 
315  // check net_divergence values:
316  {
317  PH_NEIGHBOURJOINING nj(sym_matrix);
318 
319  TEST_EXPECT_SIMILAR(nj.get_net_divergence(A), 0.4650, EPSILON);
320  TEST_EXPECT_SIMILAR(nj.get_net_divergence(B), 0.4104, EPSILON);
321  switch (test) {
322  case 1:
323  TEST_EXPECT_SIMILAR(nj.get_net_divergence(C), 0.4543, EPSILON);
324  TEST_EXPECT_SIMILAR(nj.get_net_divergence(D), 0.5973, EPSILON);
325 
326 #define EXPECTED_MIN_IJ -0.361200
327 #if defined(TEST_FORWARD_ORDER)
328  TEST_EXPECT_MIN_IJ(nj, A, B, EXPECTED_MIN_IJ);
329 #else // !defined(TEST_FORWARD_ORDER)
330  TEST_EXPECT_MIN_IJ(nj, D, C, EXPECTED_MIN_IJ);
331 #endif
332 #undef EXPECTED_MIN_IJ
333  break;
334  case 2:
335  TEST_EXPECT_SIMILAR(nj.get_net_divergence(C), 0.5897, EPSILON);
336  TEST_EXPECT_SIMILAR(nj.get_net_divergence(D), 0.7327, EPSILON);
337 
338 #define EXPECTED_MIN_IJ -0.372250
339 #if defined(TEST_FORWARD_ORDER)
340 #if defined(ARB_64)
341  TEST_EXPECT_MIN_IJ(nj, B, C, EXPECTED_MIN_IJ);
342 #else // !defined(ARB_64)
343  TEST_EXPECT_MIN_IJ(nj, A, D, EXPECTED_MIN_IJ); // @@@ similar to 64-bit w/o TEST_FORWARD_ORDER
344 #endif
345 #else // !defined(TEST_FORWARD_ORDER)
346  TEST_EXPECT_MIN_IJ(nj, D, A, EXPECTED_MIN_IJ); // @@@ no differences between 32-/64-bit version w/o TEST_FORWARD_ORDER
347 #endif
348 #undef EXPECTED_MIN_IJ
349  break;
350  default: arb_assert(0); break;
351  }
352 
353  }
354 
355  TreeNode *tree = neighbourjoining(names, sym_matrix, new SimpleRoot);
356 
357  switch (test) {
358 #if defined(TEST_FORWARD_ORDER)
359 #if defined(ARB_64)
360  case 1: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((A,B),C),D);"); break;
361  case 2: TEST_EXPECT_NEWICK(nSIMPLE, tree, "((A,(B,C)),D);"); break;
362 #else // !defined(ARB_64)
363  // @@@ 32bit version behaves different
364  case 1: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((A,B),D),C);"); break;
365  case 2: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((A,D),B),C);"); break; // similar to 64-bit w/o TEST_FORWARD_ORDER
366 #endif
367 #else // !defined(TEST_FORWARD_ORDER)
368  // @@@ no differences between 32-/64-bit version w/o TEST_FORWARD_ORDER
369  case 1: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((D,C),A),B);"); break;
370  case 2: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((D,A),B),C);"); break;
371 #endif
372  default: arb_assert(0); break;
373  }
374 
375  destroy(tree);
376  }
377 }
378 TEST_PUBLISH(TEST_neighbourjoining);
379 
380 #endif // UNIT_TESTS
381 
382 // --------------------------------------------------------------------------------
#define arb_assert(cond)
Definition: arb_assert.h:245
void remove()
Definition: NJ.hxx:36
group_matcher all()
Definition: test_unit.h:1000
PH_NEIGHBOURJOINING(const AP_smatrix &smatrix)
Definition: NJ.cxx:90
AP_FLOAT get_max_value() const
Definition: AP_matrix.cxx:43
#define TEST_EXPECT_SIMILAR(expr, want, epsilon)
Definition: test_unit.h:1287
Definition: trnsprob.h:20
TreeRoot * get_tree_root() const
Definition: TreeNode.h:380
AP_FLOAT get_min_ij(long &i, long &j)
Definition: NJ.cxx:128
GBT_LEN leftlen
Definition: TreeNode.h:132
TreeNode * rightson
Definition: TreeNode.h:131
double AP_FLOAT
Definition: AP_matrix.hxx:27
~PH_NEIGHBOURJOINING()
Definition: NJ.cxx:120
POS_TREE1 * father
Definition: probe_tree.h:39
const double EPSILON
Definition: aw_position.hxx:73
static FullNameMap names
#define TEST_PUBLISH(testfunction)
Definition: test_unit.h:1485
#define TEST_EXPECT_NEWICK(format, tree, expected_newick)
Definition: test_unit.h:1449
virtual TreeNode * makeNode() const =0
static void * D
Definition: mem.cxx:18
TreeNode * father
Definition: TreeNode.h:131
TreeNode * neighbourjoining(const char *const *names, const AP_smatrix &smatrix, TreeRoot *troot)
Definition: NJ.cxx:203
#define SIZE
Definition: date.cxx:7
expectation_group & add(const expectation &e)
Definition: test_unit.h:801
AP_FLOAT val
Definition: NJ.hxx:30
#define that(thing)
Definition: test_unit.h:1032
#define ph_assert(cond)
Definition: phylo.hxx:24
TreeNode * leftson
Definition: TreeNode.h:131
GBT_LEN rightlen
Definition: TreeNode.h:132
#define is_equal_to(val)
Definition: test_unit.h:1014
PH_NEIGHBOUR_DIST()
Definition: NJ.cxx:16
PH_NEIGHBOUR_DIST * next
Definition: NJ.hxx:32
Definition: trnsprob.h:20
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
void get_last_ij(long &i, long &j)
Definition: NJ.cxx:194
#define fulfills(pred, arg)
Definition: test_unit.h:1026
AP_FLOAT get_dist(long i, long j)
Definition: NJ.cxx:199
char * name
Definition: TreeNode.h:134
void join_nodes(long i, long j, AP_FLOAT &leftl, AP_FLOAT &rightlen)
Definition: NJ.cxx:160
PH_NEIGHBOUR_DIST * previous
Definition: NJ.hxx:32
void add(PH_NEIGHBOUR_DIST *root)
Definition: NJ.hxx:42
AP_FLOAT fast_get(size_t i, size_t j) const
Definition: AP_matrix.hxx:45
#define NULp
Definition: cxxforward.h:97
void markAsLeaf()
Definition: TreeNode.h:172
void destroy(TreeNode *that)
Definition: TreeNode.h:559
virtual void change_root(TreeNode *old, TreeNode *newroot)
Definition: TreeNode.cxx:28
size_t size() const
Definition: AP_matrix.hxx:50
#define max(a, b)
Definition: f2c.h:154