ARB
NJ.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : NJ.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "NJ.hxx"
12 #include <neighbourjoin.hxx>
13 #include <TreeNode.h>
14 #include <arb_diff.h>
15 #include <arb_progress.h>
16 
18  memset((char *)this, 0, sizeof(PH_NEIGHBOUR_DIST));
19 }
20 
21 
22 void PH_NEIGHBOURJOINING::remove_taxa_from_dist_list(long i) { // O(n/2)
23  long a, j;
25  for (a=0; a<swap_size; a++) {
26  j = swap_tab[a];
27  if (i==j) continue;
28  if (j<i) {
29  nd = &(dist_matrix[i][j]);
30  }
31  else {
32  nd = &(dist_matrix[j][i]);
33  }
34  nd->remove();
35  net_divergence[j] -= nd->val; // corr net divergence
36  }
37 }
38 void PH_NEIGHBOURJOINING::add_taxa_to_dist_list(long i) { // O(n/2)
39  long a;
40  long pos, j;
42  AP_FLOAT my_nd = 0.0;
43  for (a=0; a<swap_size; a++) {
44  j = swap_tab[a];
45  if (i==j) continue;
46  if (j<i) {
47  nd = &(dist_matrix[i][j]);
48  }
49  else {
50  nd = &(dist_matrix[j][i]);
51  }
52  ph_assert(!nd->previous);
53  pos = (int)(nd->val*dist_list_corr);
54  if (pos >= dist_list_size) {
55  pos = dist_list_size-1;
56  } else if (pos<0)
57  pos = 0;
58  nd->add(&(dist_list[pos]));
59 
60  net_divergence[j] += nd->val; // corr net divergence
61  my_nd += nd->val;
62  }
63  net_divergence[i] = my_nd;
64 }
65 
66 AP_FLOAT PH_NEIGHBOURJOINING::get_max_net_divergence() { // O(n/2)
67  long a, i;
68  AP_FLOAT max = 0.0;
69  for (a=0; a<swap_size; a++) {
70  i = swap_tab[a];
71  if (net_divergence[i] > max) max = net_divergence[i];
72  }
73  return max;
74 }
75 
76 void PH_NEIGHBOURJOINING::remove_taxa_from_swap_tab(long i) { // O(n/2)
77  long a;
78  long *source, *dest;
79  source = dest = swap_tab;
80  for (a=0; a<swap_size; a++) {
81  if (swap_tab[a] == i) {
82  source++;
83  }
84  else {
85  *(dest++) = *(source++);
86  }
87  }
88  swap_size --;
89 }
90 
92  size = smatrix.size();
93 
94  // init swap tab
95  swap_size = size;
96  swap_tab = new long[size];
97  for (long i=0; i<swap_size; i++) swap_tab[i] = i; // LOOP_VECTORIZED=2[!<5] // tested down to gcc 5.5.0 (fails on 4.9.2)
98 
99  ARB_calloc(net_divergence, size);
100 
101  dist_list_size = size; // hope to be the best
102  dist_list = new PH_NEIGHBOUR_DIST[dist_list_size]; // the roots, no elems
103  dist_list_corr = (dist_list_size-2.0)/smatrix.get_max_value();
104 
105  dist_matrix = new PH_NEIGHBOUR_DIST*[size];
106  for (long i=0; i<size; i++) {
107  dist_matrix[i] = new PH_NEIGHBOUR_DIST[i];
108  for (long j=0; j<i; j++) {
109  dist_matrix[i][j].val = smatrix.fast_get(i, j);
110  dist_matrix[i][j].i = i;
111  dist_matrix[i][j].j = j;
112  }
113  }
114  for (long i=0; i<size; i++) {
115  swap_size = i; // to calculate the correct net divergence..
116  add_taxa_to_dist_list(i); // ..add to dist list and add n.d.
117  }
118  swap_size = size;
119 }
120 
122  for (long i=0; i<size; i++) delete [] dist_matrix[i];
123  delete [] dist_matrix;
124  delete [] dist_list;
125  free(net_divergence);
126  delete [] swap_tab;
127 }
128 
129 AP_FLOAT PH_NEIGHBOURJOINING::get_min_ij(long& mini, long& minj) { // O(n*n/speedup)
130  // returns minval (only used by test inspection)
131 
132  AP_FLOAT maxri = get_max_net_divergence(); // O(n/2)
133  PH_NEIGHBOUR_DIST *dl;
134  long stat = 0;
135  AP_FLOAT x;
136  AP_FLOAT minval;
137  minval = 100000.0;
138  AP_FLOAT N_1 = 1.0/(swap_size-2.0);
139  maxri = maxri*2.0*N_1;
140  long pos;
141 
142  get_last_ij(mini, minj);
143 
144  for (pos=0; pos<dist_list_size; pos++) {
145  if (minval < pos/dist_list_corr - maxri) break;
146  // no way to get a better minimum
147  dl = dist_list[pos].next; // first entry does not contain information
148  for (; dl; dl=dl->next) {
149  x = (net_divergence[dl->i] + net_divergence[dl->j])*N_1;
150  if (dl->val-x<minval) {
151  minval = dl->val -x;
152  minj = dl->i;
153  mini = dl->j;
154  }
155  stat++;
156  }
157  }
158  return minval;
159 }
160 
161 void PH_NEIGHBOURJOINING::join_nodes(long i, long j, AP_FLOAT &leftl, AP_FLOAT& rightl) {
162  PH_NEIGHBOUR_DIST **d = dist_matrix;
163  AP_FLOAT dji;
164 
165  AP_FLOAT dist = get_dist(i, j);
166 
167  leftl = dist*.5 + (net_divergence[i] - net_divergence[j])*.5/
168  (swap_size - 2.0);
169  rightl = dist - leftl;
170 
171  remove_taxa_from_dist_list(j);
172  remove_taxa_from_swap_tab(j);
173  remove_taxa_from_dist_list(i);
174 
175  long a, k;
176  dji = d[j][i].val;
177  for (a=0; a<swap_size; a++) {
178  k = swap_tab[a];
179  if (k==i) continue; // k == j not possible
180  if (k>i) {
181  if (k>j) {
182  d[k][i].val = .5*(d[k][i].val + d[k][j].val - dji);
183  }
184  else {
185  d[k][i].val = .5*(d[k][i].val + d[j][k].val - dji);
186  }
187  }
188  else {
189  d[i][k].val = 0.5 * (d[i][k].val + d[j][k].val - dji);
190  }
191  }
192  add_taxa_to_dist_list(i);
193 }
194 
195 void PH_NEIGHBOURJOINING::get_last_ij(long& i, long& j) {
196  i = swap_tab[0];
197  j = swap_tab[1];
198 }
199 
201  return dist_matrix[j][i].val;
202 }
203 
204 TreeNode *neighbourjoining(const char *const *names, const AP_smatrix& smatrix, TreeRoot *troot, bool *aborted_flag) { // @@@ pass 'names' as ConstStrArray?
205  // structure_size >= sizeof(TreeNode);
206  // lower triangular matrix
207  // size: size of matrix
208 
209  const size_t msize = smatrix.size();
210 
211  // assuming algorithm ~ O(N^3) with speedup because N decrements with each join:
212  const size_t steps = sum_of_triangular_numbers(msize); // keep sync with ../../DIST/DI_matr.cxx@NJsteps
213 
214  arb_progress progress("neighbourjoining tree", steps);
215 
216  PH_NEIGHBOURJOINING nj(smatrix);
217  TreeNode **nodes = ARB_calloc<TreeNode*>(msize);
218 
219  for (size_t i=0; i<msize; i++) {
220  nodes[i] = troot->makeNode();
221  nodes[i]->name = strdup(names[i]);
222  nodes[i]->markAsLeaf();
223  }
224 
225  size_t rest_size = msize;
226  for (size_t i=2; i<msize && !progress.aborted(); i++) {
227  long a, b;
228  nj.get_min_ij(a, b);
229 
230  AP_FLOAT ll, rl;
231  nj.join_nodes(a, b, ll, rl);
232 
233  TreeNode *father = troot->makeNode();
234  father->leftson = nodes[a];
235  father->rightson = nodes[b];
236  father->leftlen = ll;
237  father->rightlen = rl;
238  nodes[a]->father = father;
239  nodes[b]->father = father;
240  nodes[a] = father;
241 
242  progress.inc_by(triangular_number(rest_size));
243  --rest_size;
244  }
245 
246  TreeNode *father = NULp;
247  if (progress.aborted()) {
248  if (aborted_flag) *aborted_flag = true;
249  }
250  else {
251  long a, b;
252  nj.get_last_ij(a, b);
253 
254  AP_FLOAT dist = nj.get_dist(a, b);
255 
256  AP_FLOAT ll = dist*0.5;
257  AP_FLOAT rl = dist*0.5;
258 
259  father = troot->makeNode();
260 
261  father->leftson = nodes[a];
262  father->rightson = nodes[b];
263 
264  father->leftlen = ll;
265  father->rightlen = rl;
266 
267  nodes[a]->father = father;
268  nodes[b]->father = father;
269 
270  free(nodes);
271 
272  father->get_tree_root()->change_root(NULp, father);
273  }
274 
275  // progress.dump();
276  progress.done(); // some steps are missing (e.g. 2nd loop starts at '2')
277 
278  return father;
279 }
280 
281 // --------------------------------------------------------------------------------
282 
283 #ifdef UNIT_TESTS
284 #ifndef TEST_UNIT_H
285 #include <test_unit.h>
286 #endif
287 
288 static const AP_FLOAT EPSILON = 0.0001;
289 
290 static arb_test::match_expectation min_ij_equals(PH_NEIGHBOURJOINING& nj, long expected_i, long expected_j, AP_FLOAT expected_minval) {
291  using namespace arb_test;
292  expectation_group expected;
293 
294  long i, j;
295  AP_FLOAT minval = nj.get_min_ij(i, j);
296 
297  expected.add(that(i).is_equal_to(expected_i));
298  expected.add(that(j).is_equal_to(expected_j));
299  expected.add(that(minval).fulfills(epsilon_similar(EPSILON), expected_minval));
300 
301  return all().ofgroup(expected);
302 }
303 
304 #define TEST_EXPECT_MIN_IJ(nj,i,j,minval) TEST_EXPECTATION(min_ij_equals(nj,i,j,minval))
305 
306 void TEST_neighbourjoining() {
307  const size_t SIZE = 4;
308 
309 #define TEST_FORWARD_ORDER // @@@ changing the order of nodes here changes the resulting trees
310  // i do not understand, if that means there is sth wrong or not..
311 
312 #if defined(TEST_FORWARD_ORDER)
313  const char *names[SIZE] = { "A", "B", "C", "D"};
314  enum { A, B, C, D };
315 #else // !defined(TEST_FORWARD_ORDER)
316  const char *names[SIZE] = { "D", "C", "B", "A"};
317  enum { D, C, B, A };
318 #endif
319 
320  for (int test = 1; test <= 2; ++test) {
321  AP_smatrix sym_matrix(SIZE);
322 
323  // Note: values used here are distances
324  for (size_t i = 0; i < SIZE; ++i) sym_matrix.set(i, i, 0.0);
325 
326  sym_matrix.set(A, B, 0.0765); sym_matrix.set(A, C, 0.1619); sym_matrix.set(A, D, 0.2266);
327  sym_matrix.set(B, C, 0.1278); sym_matrix.set(B, D, 0.2061);
328 
329  switch (test) {
330  case 1: sym_matrix.set(C, D, 0.1646); break;
331  case 2: sym_matrix.set(C, D, 0.30); break;
332  default: arb_assert(0); break;
333  }
334 
335  // check net_divergence values:
336  {
337  PH_NEIGHBOURJOINING nj(sym_matrix);
338 
339  TEST_EXPECT_SIMILAR(nj.get_net_divergence(A), 0.4650, EPSILON);
340  TEST_EXPECT_SIMILAR(nj.get_net_divergence(B), 0.4104, EPSILON);
341  switch (test) {
342  case 1:
343  TEST_EXPECT_SIMILAR(nj.get_net_divergence(C), 0.4543, EPSILON);
344  TEST_EXPECT_SIMILAR(nj.get_net_divergence(D), 0.5973, EPSILON);
345 
346 #define EXPECTED_MIN_IJ -0.361200
347 #if defined(TEST_FORWARD_ORDER)
348  TEST_EXPECT_MIN_IJ(nj, A, B, EXPECTED_MIN_IJ);
349 #else // !defined(TEST_FORWARD_ORDER)
350  TEST_EXPECT_MIN_IJ(nj, D, C, EXPECTED_MIN_IJ);
351 #endif
352 #undef EXPECTED_MIN_IJ
353  break;
354  case 2:
355  TEST_EXPECT_SIMILAR(nj.get_net_divergence(C), 0.5897, EPSILON);
356  TEST_EXPECT_SIMILAR(nj.get_net_divergence(D), 0.7327, EPSILON);
357 
358 #define EXPECTED_MIN_IJ -0.372250
359 #if defined(TEST_FORWARD_ORDER)
360 #if defined(ARB_64)
361  TEST_EXPECT_MIN_IJ(nj, B, C, EXPECTED_MIN_IJ);
362 #else // !defined(ARB_64)
363  TEST_EXPECT_MIN_IJ(nj, A, D, EXPECTED_MIN_IJ); // @@@ similar to 64-bit w/o TEST_FORWARD_ORDER
364 #endif
365 #else // !defined(TEST_FORWARD_ORDER)
366  TEST_EXPECT_MIN_IJ(nj, D, A, EXPECTED_MIN_IJ); // @@@ no differences between 32-/64-bit version w/o TEST_FORWARD_ORDER
367 #endif
368 #undef EXPECTED_MIN_IJ
369  break;
370  default: arb_assert(0); break;
371  }
372 
373  }
374 
375  TreeNode *tree = neighbourjoining(names, sym_matrix, new SimpleRoot, NULp);
376 
377  switch (test) {
378 #if defined(TEST_FORWARD_ORDER)
379 #if defined(ARB_64)
380  case 1: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((A,B),C),D);"); break;
381  case 2: TEST_EXPECT_NEWICK(nSIMPLE, tree, "((A,(B,C)),D);"); break;
382 #else // !defined(ARB_64)
383  // @@@ 32bit version behaves different
384  case 1: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((A,B),D),C);"); break;
385  case 2: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((A,D),B),C);"); break; // similar to 64-bit w/o TEST_FORWARD_ORDER
386 #endif
387 #else // !defined(TEST_FORWARD_ORDER)
388  // @@@ no differences between 32-/64-bit version w/o TEST_FORWARD_ORDER
389  case 1: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((D,C),A),B);"); break;
390  case 2: TEST_EXPECT_NEWICK(nSIMPLE, tree, "(((D,A),B),C);"); break;
391 #endif
392  default: arb_assert(0); break;
393  }
394 
395  destroy(tree);
396  }
397 }
398 TEST_PUBLISH(TEST_neighbourjoining);
399 
400 #endif // UNIT_TESTS
401 
402 // --------------------------------------------------------------------------------
#define arb_assert(cond)
Definition: arb_assert.h:245
void remove()
Definition: NJ.hxx:36
group_matcher all()
Definition: test_unit.h:1011
PH_NEIGHBOURJOINING(const AP_smatrix &smatrix)
Definition: NJ.cxx:91
#define TEST_EXPECT_SIMILAR(expr, want, epsilon)
Definition: test_unit.h:1298
T get_max_value() const
Definition: matrix.h:54
Definition: trnsprob.h:20
TreeRoot * get_tree_root() const
Definition: TreeNode.h:421
CONSTEXPR_INLINE long sum_of_triangular_numbers(const long L)
Definition: triangular.h:30
AP_FLOAT get_min_ij(long &i, long &j)
Definition: NJ.cxx:129
TreeNode * neighbourjoining(const char *const *names, const AP_smatrix &smatrix, TreeRoot *troot, bool *aborted_flag)
Definition: NJ.cxx:204
CONSTEXPR_INLINE long triangular_number(const long N)
Definition: triangular.h:18
GBT_LEN leftlen
Definition: TreeNode.h:172
TreeNode * rightson
Definition: TreeNode.h:171
double AP_FLOAT
Definition: AP_matrix.hxx:30
~PH_NEIGHBOURJOINING()
Definition: NJ.cxx:121
POS_TREE1 * father
Definition: probe_tree.h:39
const double EPSILON
Definition: aw_position.hxx:73
static FullNameMap names
#define TEST_PUBLISH(testfunction)
Definition: test_unit.h:1517
#define TEST_EXPECT_NEWICK(format, tree, expected_newick)
Definition: test_unit.h:1481
virtual TreeNode * makeNode() const =0
bool aborted()
Definition: arb_progress.h:335
static void * D
Definition: mem.cxx:18
TreeNode * father
Definition: TreeNode.h:171
#define SIZE
Definition: date.cxx:7
expectation_group & add(const expectation &e)
Definition: test_unit.h:812
AP_FLOAT val
Definition: NJ.hxx:30
#define that(thing)
Definition: test_unit.h:1043
#define ph_assert(cond)
Definition: phylo.hxx:24
TreeNode * leftson
Definition: TreeNode.h:171
GBT_LEN rightlen
Definition: TreeNode.h:172
#define is_equal_to(val)
Definition: test_unit.h:1025
PH_NEIGHBOUR_DIST()
Definition: NJ.cxx:17
PH_NEIGHBOUR_DIST * next
Definition: NJ.hxx:32
Definition: trnsprob.h:20
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
void get_last_ij(long &i, long &j)
Definition: NJ.cxx:195
#define fulfills(pred, arg)
Definition: test_unit.h:1037
AP_FLOAT get_dist(long i, long j)
Definition: NJ.cxx:200
char * name
Definition: TreeNode.h:174
void join_nodes(long i, long j, AP_FLOAT &leftl, AP_FLOAT &rightlen)
Definition: NJ.cxx:161
PH_NEIGHBOUR_DIST * previous
Definition: NJ.hxx:32
void add(PH_NEIGHBOUR_DIST *root)
Definition: NJ.hxx:42
#define NULp
Definition: cxxforward.h:116
void markAsLeaf()
Definition: TreeNode.h:212
size_t size() const
Definition: matrix.h:66
void inc_by(PINT count)
Definition: arb_progress.h:361
void destroy(TreeNode *that)
Definition: TreeNode.h:600
virtual void change_root(TreeNode *old, TreeNode *newroot)
Definition: TreeNode.cxx:28
T fast_get(size_t i, size_t j) const
Definition: matrix.h:51
#define max(a, b)
Definition: f2c.h:154