ARB
CT_part.cxx
Go to the documentation of this file.
1 // ============================================================= //
2 // //
3 // File : CT_part.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // ============================================================= //
10 
11 /* This module is designed to organize the data structure partitions
12  partitions represent the edges of a tree */
13 // the partitions are implemented as an array of longs
14 // Each leaf in a GBT-Tree is represented as one Bit in the Partition
15 
16 #include "CT_part.hxx"
17 #include "CT_common.hxx"
18 
19 #define BITS_PER_PELEM (sizeof(PELEM)*8)
20 
21 #if defined(DUMP_PART_INIT) || defined(UNIT_TESTS)
22 static const char *readable_cutmask(PELEM mask) {
23  static char readable[BITS_PER_PELEM+1];
24  memset(readable, '0', BITS_PER_PELEM);
25  readable[BITS_PER_PELEM] = 0;
26 
27  for (int b = BITS_PER_PELEM-1; b >= 0; --b) {
28  if (mask&1) readable[b] = '1';
29  mask = mask>>1;
30  }
31  return readable;
32 }
33 #endif
34 
36  : cutmask(0),
37  longs((((len + 7) / 8)+sizeof(PELEM)-1) / sizeof(PELEM)),
38  bits(len),
39  id(0)
40 {
47  int j = len % BITS_PER_PELEM;
48  if (!j) j += BITS_PER_PELEM;
49 
50  for (int i=0; i<j; i++) {
51  cutmask <<= 1;
52  cutmask |= 1;
53  }
54 
55 #if defined(DEBUG)
56  size_t possible = longs*BITS_PER_PELEM;
57  arb_assert((possible-bits)<BITS_PER_PELEM); // longs is too big (wasted space)
58 
59 #if defined(DUMP_PART_INIT)
60  printf("leafs=%i\n", len);
61  printf("cutmask='%s'\n", readable_cutmask(cutmask));
62  printf("longs=%i (can hold %zu bits)\n", longs, possible);
63  printf("bits=%i\n", bits);
64 #endif
65 #endif
66 }
67 
68 #if defined(NTREE_DEBUG_FUNCTIONS)
69 
70 static const CharPtrArray *namesPtr = NULp;
71 
72 void PART::start_pretty_printing(const CharPtrArray& names) { namesPtr = &names; }
73 void PART::stop_pretty_printing() { namesPtr = NULp; }
74 
75 void PART::print() const {
76  // ! Testfunction to print a part
77  int k = 0;
78  const int longs = get_longs();
79  const int plen = info->get_bits();
80 
81  if (namesPtr) {
82  const CharPtrArray& names = *namesPtr;
83  for (int part = 0; part<=1; ++part) {
84  // bool first = true;
85  for (int i=0; i<longs; i++) {
86  PELEM el = 1;
87  for (int j=0; k<plen && size_t(j)<sizeof(PELEM)*8; j++, k++) {
88  bool bitset = p[i] & el;
89  if (bitset == part) {
90  const char *name = names[k];
91 #if 1
92  fputc(name[0], stdout); // first char of name
93 #else
94  if (!first) fputc(',', stdout);
95  else first = false;
96  fputs(name, stdout); // full name
97 #endif
98  }
99  el <<= 1;
100  }
101  }
102  if (!part) {
103  fputs("---", stdout);
104  k = 0;
105  }
106  }
107  }
108  else {
109  for (int i=0; i<longs; i++) {
110  PELEM el = 1;
111  for (int j=0; k<plen && size_t(j)<sizeof(PELEM)*8; j++, k++) {
112  bool bitset = p[i] & el;
113  fputc('0'+bitset, stdout);
114  el <<= 1;
115  }
116  }
117  }
118 
119  printf(" len=%.5f prob=%5.1f%% w.len=%.5f leaf=%i dist2center=%i\n",
120  len, weight*100.0, get_len(), is_leaf_edge(), distance_to_tree_center());
121 }
122 #endif
123 
129  PART *p = new PART(this, 1.0);
130  p->invert();
131  arb_assert(p->is_valid());
132  return p;
133 }
134 
135 bool PART::overlaps_with(const PART *other) const {
139  arb_assert(is_valid());
140  arb_assert(other->is_valid());
141 
142  const int longs = get_longs();
143  for (int i=0; i<longs; i++) {
144  if (p[i] & other->p[i]) return true;
145  }
146  return false;
147 }
148 
149 void PART::invert() {
151  //
152  // Each edge in a tree connects two subtrees.
153  // These subtrees are represented by inverse partitions
154 
155  arb_assert(is_valid());
156 
157  const int longs = get_longs();
158  for (int i=0; i<longs; i++) { // LOOP_VECTORIZED
159  p[i] = ~p[i];
160  }
161  p[longs-1] &= get_cutmask();
162 
163  members = get_maxsize()-members;
164 
165  arb_assert(is_valid());
166 }
167 
168 void PART::invertInSuperset(const PART *superset) {
169  arb_assert(is_valid());
170  arb_assert(is_subset_of(superset));
171 
172  const int longs = get_longs();
173  for (int i=0; i<longs; i++) { // LOOP_VECTORIZED
174  p[i] = p[i] ^ superset->p[i];
175  }
176  p[longs-1] &= get_cutmask();
177 
178  members = superset->get_members()-members;
179 
180  arb_assert(is_valid());
181  arb_assert(is_subset_of(superset));
182 }
183 
184 
185 void PART::add_members_from(const PART *source) {
187  arb_assert(source->is_valid());
188  arb_assert(is_valid());
189 
190  bool distinct = disjunct_from(source);
191 
192  const int longs = get_longs();
193  for (int i=0; i<longs; i++) { // LOOP_VECTORIZED
194  p[i] |= source->p[i];
195  }
196 
197  if (distinct) {
198  members += source->members;
199  }
200  else {
201  members = count_members();
202  }
203 
204  arb_assert(is_valid());
205 }
206 
207 
208 bool PART::equals(const PART *other) const {
211  arb_assert(is_valid());
212  arb_assert(other->is_valid());
213 
214  const int longs = get_longs();
215  for (int i=0; i<longs; i++) {
216  if (p[i] != other->p[i]) return false;
217  }
218  return true;
219 }
220 
221 
222 unsigned PART::key() const {
224  arb_assert(is_valid());
225 
226  PELEM ph = 0;
227  const int longs = get_longs();
228  for (int i=0; i<longs; i++) { // LOOP_VECTORIZED
229  ph ^= p[i];
230  }
231 
232  return ph;
233 }
234 
235 inline uint8_t bytebitcount(uint8_t byte) {
236  uint8_t count = 0;
237  for (uint8_t b = 0; b<8; ++b) {
238  if (byte&1) ++count;
239  byte = byte>>1;
240  }
241  return count;
242 }
243 struct bitcounter {
244  uint8_t bytebits[256];
246  for (unsigned i = 0; i<256; ++i) {
247  bytebits[i] = bytebitcount(i);
248  }
249  }
250 };
251 
252 inline int bitcount(PELEM e) {
253  static bitcounter counted; // static lookup table
254 
255  int leafs = 0;
256 #if defined(DUMP_PART_DISTANCE)
257  fprintf(stdout, "bitcount(%04x) = ", e);
258 #endif
259  for (size_t bi = 0; bi<sizeof(e); ++bi) {
260  leafs += counted.bytebits[e&0xff];
261  e = e>>8;
262  }
263 #if defined(DUMP_PART_DISTANCE)
264  fprintf(stdout, "%i\n", leafs);
265 #endif
266  return leafs;
267 }
268 
269 int PART::count_members() const {
271  int leafs = 0;
272  const int longs = get_longs();
273  for (int i = 0; i<(longs-1); ++i) {
274  leafs += bitcount(p[i]);
275  }
276  leafs += bitcount(p[longs-1] & get_cutmask());
277  return leafs;
278 }
279 
280 bool PART::is_standardized() const { // @@@ inline
285  // may be any criteria which differs between PART and its inverse
286  // if you change the criteria, generated trees will change
287  // (because branch-insertion-order is affected)
288 
289  return bit_is_set(0);
290 }
291 
298  arb_assert(is_valid());
299  if (!is_standardized()) {
300  invert();
302  }
303  arb_assert(is_valid());
304 }
305 
306 int PART::index() const {
314  arb_assert(is_valid());
316 
317  int pos = 0;
318  const int longs = get_longs();
319  for (int i=0; i<longs; i++) {
320  PELEM p_temp = p[i];
321  pos = i * sizeof(PELEM) * 8;
322  if (p_temp) {
323  for (; p_temp; p_temp >>= 1, pos++) {
324  ;
325  }
326  break;
327  }
328  }
329  return pos-1;
330 }
331 
332 int PART::insertionOrder_cmp(const PART *other) const {
333  // defines order in which edges will be inserted into the consensus tree
334 
335  if (this == other) return 0;
336 
337  int cmp = is_leaf_edge() - other->is_leaf_edge();
338 
339  if (!cmp) {
340  cmp = -double_cmp(weight, other->weight); // insert bigger weight first
341  if (!cmp) {
342  int centerdist1 = distance_to_tree_center();
343  int centerdist2 = other->distance_to_tree_center();
344 
345  cmp = centerdist1-centerdist2; // insert central edges first
346 
347  if (!cmp) {
348  cmp = -double_cmp(get_len(), other->get_len()); // NOW REALLY insert bigger len first
349  // (change affected test results: increased in-tree-distance,
350  // but reduced parsimony value of result-trees)
351  if (!cmp) {
352  cmp = id - other->id; // strict by definition
353  arb_assert(cmp);
354  }
355  }
356  }
357  }
358 
359  return cmp;
360 }
361 inline int PELEM_cmp(const PELEM& p1, const PELEM& p2) {
362  return p1<p2 ? -1 : (p1>p2 ? 1 : 0);
363 }
364 
365 int PART::topological_cmp(const PART *other) const {
366  // define a strict order on topologies defined by edges
367 
368  if (this == other) return 0;
369 
371  arb_assert(other->is_standardized());
372 
373  int cmp = members - other->members;
374  if (!cmp) {
375  const int longs = get_longs();
376  for (int i = 0; !cmp && i<longs; ++i) {
377  cmp = PELEM_cmp(p[i], other->p[i]);
378  }
379  }
380 
381  arb_assert(contradicted(cmp, equals(other)));
382 
383  return cmp;
384 }
385 
386 #if defined(DUMP_PART_DISTANCE)
387 static void dumpbits(const PELEM p) {
388  PELEM el = 1;
389  for (int j=0; size_t(j)<sizeof(PELEM)*8; j++) {
390  bool bitset = p & el;
391  fputc("-1"[bitset], stdout);
392  el <<= 1;
393  }
394 }
395 #endif
396 
397 int PART::distanceTo(const PART *other, const PART *this_superset, const PART *other_superset) const {
415 #if defined(DUMP_PART_DISTANCE)
416  fputs("this: ", stdout); print();
417  fputs("other: ", stdout); other->print();
418  fputs("this_superset: ", stdout); this_superset->print();
419  fputs("other_superset:", stdout); other_superset->print();
420 #endif
421 
422 
423 #if defined(ASSERTION_USED)
424  if (this != this_superset) { // avoid that calls from calcTreeDistance fail here
425  if (!is_real_son_of(this_superset)) { // if 'this' is NOT inside tree 'this_superset' ...
426  PART *thisInverse = clone();
427  thisInverse->invert();
428  arb_assert(thisInverse->is_real_son_of(this_superset)); // assert inverse of 'this' is inside tree 'this_superset'
429  delete thisInverse;
430  }
431  }
432  if (other != other_superset) { // avoid that calls from calcTreeDistance fail here
433  if (!other->is_real_son_of(other_superset)) { // if 'other' is NOT inside tree 'other_superset' ...
434  PART *otherInverse = other->clone();
435  otherInverse->invert();
436  arb_assert(otherInverse->is_real_son_of(other_superset)); // assert inverse of 'other' is inside tree 'other_superset'
437  delete otherInverse;
438  }
439  }
440 #endif
441 
442  int dist = 0;
443 
444  const int longs = get_longs();
445  for (int i = 0; i<longs; ++i) {
446  PELEM ts = this_superset->p[i];
447  PELEM os = other_superset->p[i];
448 
449  if (i == (longs-1)) {
450  const PELEM cutmask = this_superset->get_cutmask(); // should be identical for all involved PARTs
451 
452  ts = ts & cutmask;
453  os = os & cutmask;
454  }
455 
456  const PELEM O = ts ^ os; // calculate superset difference
457  const PELEM si = ts & os; // calculate superset intersection
458 
459  const PELEM t0 = p[i] & si;
460  const PELEM o0 = other->p[i] & si;
461 
462  const PELEM ti = t0 ^ si; // like invertInSuperset, but only performed in superset intersection
463  const PELEM oi = o0 ^ si;
464 
465  // calculate all 4 possible difference-parts:
466  const PELEM d00 = t0 ^ o0; // union(t0, o0) - intersection(t0,o0)
467  const PELEM d0i = t0 ^ oi;
468  const PELEM di0 = ti ^ o0;
469  const PELEM dii = ti ^ oi;
470 
471  const int d1 = bitcount(d00) + bitcount(dii); // calculate absolute values and sum pairwise
472  const int d2 = bitcount(d0i) + bitcount(di0);
473 
474  const int idist = bitcount(O) + std::min(d1, d2); // calculate whole difference (of current PELEM)
475 
476 #if defined(DUMP_PART_DISTANCE)
477 
478 #define DUMPBITS(var) do { fprintf(stdout, "%5s = %04x = ", #var, var); dumpbits(var); fputc('\n', stdout); } while(0)
479 #define DUMPINT(var) fprintf(stdout, "%5s = %i\n", #var, var)
480 
481  DUMPINT(i);
482 
483  DUMPBITS(ts);
484  DUMPBITS(os);
485  DUMPBITS(t0);
486  DUMPBITS(o0);
487  DUMPBITS(ti);
488  DUMPBITS(oi);
489  DUMPBITS(O);
490  DUMPBITS(d00);
491  DUMPBITS(d0i);
492  DUMPBITS(di0);
493  DUMPBITS(dii);
494 
495  DUMPINT(d1);
496  DUMPINT(d2);
497  DUMPINT(idist);
498 #endif
499 
500  dist += idist; // sum up
501  }
502 
503 #if defined(DUMP_PART_DISTANCE)
504  fprintf(stdout, "resulting dist=%i\n", dist);
505 #endif
506 
507  return dist;
508 }
509 
510 int PART_FWD::calcDistance(const PART *e1, const PART *e2, const PART *t1, const PART *t2) {
520  return e1->distanceTo(e2, t1, t2);
521 }
522 
523 const TreeNode *PART_FWD::get_origin(const PART *part) {
524  return part ? part->get_origin() : NULp;
525 }
526 
527 int PART_FWD::get_members(const PART *part) {
528  return part->get_members();
529 }
530 
532  delete part;
533 }
534 
535 
536 // --------------------------------------------------------------------------------
537 
538 #ifdef UNIT_TESTS
539 #ifndef TEST_UNIT_H
540 #include <test_unit.h>
541 #endif
542 
543 void TEST_PartRegistry() {
544  {
545  PartitionSize reg(0);
546  TEST_EXPECT_EQUAL(reg.get_bits(), 0);
547  TEST_EXPECT_EQUAL(reg.get_longs(), 0);
548  // cutmask doesnt matter
549  }
550 
551  {
552  PartitionSize reg(1);
553  TEST_EXPECT_EQUAL(reg.get_bits(), 1);
554  TEST_EXPECT_EQUAL(reg.get_longs(), 1);
555  TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "00000000000000000000000000000001");
556  }
557 
558  {
559  PartitionSize reg(31);
560  TEST_EXPECT_EQUAL(reg.get_bits(), 31);
561  TEST_EXPECT_EQUAL(reg.get_longs(), 1);
562  TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "01111111111111111111111111111111");
563  }
564 
565  {
566  PartitionSize reg(32);
567  TEST_EXPECT_EQUAL(reg.get_bits(), 32);
568  TEST_EXPECT_EQUAL(reg.get_longs(), 1);
569  TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "11111111111111111111111111111111");
570  }
571 
572  {
573  PartitionSize reg(33);
574  TEST_EXPECT_EQUAL(reg.get_bits(), 33);
575  TEST_EXPECT_EQUAL(reg.get_longs(), 2);
576  TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "00000000000000000000000000000001");
577  }
578 
579  {
580  PartitionSize reg(95);
581  TEST_EXPECT_EQUAL(reg.get_bits(), 95);
582  TEST_EXPECT_EQUAL(reg.get_longs(), 3);
583  TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "01111111111111111111111111111111");
584  }
585 }
586 
587 #endif // UNIT_TESTS
588 
589 // --------------------------------------------------------------------------------
590 
591 
#define arb_assert(cond)
Definition: arb_assert.h:245
int topological_cmp(const PART *other) const
Definition: CT_part.cxx:365
const char * id
Definition: AliAdmin.cxx:17
int PELEM_cmp(const PELEM &p1, const PELEM &p2)
Definition: CT_part.cxx:361
#define BITS_PER_PELEM
Definition: CT_part.cxx:19
const TreeNode * get_origin(const PART *part)
Definition: CT_part.cxx:523
PartitionSize(int len)
Definition: CT_part.cxx:35
int get_members() const
Definition: CT_part.hxx:225
bool is_leaf_edge() const
Definition: CT_part.hxx:228
unsigned int PELEM
Definition: CT_part.hxx:31
int get_maxsize() const
Definition: CT_part.hxx:121
void standardize()
Definition: CT_part.cxx:292
bool is_valid() const
Definition: CT_part.hxx:131
int get_members(const PART *part)
Definition: CT_part.cxx:527
PART * clone() const
Definition: CT_part.hxx:118
void invertInSuperset(const PART *superset)
Definition: CT_part.cxx:168
static FullNameMap names
int calcDistance(const PART *e1, const PART *e2, const PART *t1, const PART *t2)
Definition: CT_part.cxx:510
int index() const
Definition: CT_part.cxx:306
int get_longs() const
Definition: CT_part.hxx:120
uint8_t bytebitcount(uint8_t byte)
Definition: CT_part.cxx:235
unsigned key() const
Definition: CT_part.cxx:222
bool is_standardized() const
Definition: CT_part.cxx:280
fputc('\n', stderr)
PELEM get_cutmask() const
Definition: CT_part.hxx:122
PART * create_root() const
Definition: CT_part.cxx:124
bool bit_is_set(int pos) const
Definition: CT_part.hxx:156
int bitcount(PELEM e)
Definition: CT_part.cxx:252
#define cmp(h1, h2)
Definition: admap.cxx:50
bool disjunct_from(const PART *other) const
Definition: CT_part.hxx:217
bool equals(const PART *other) const
Definition: CT_part.cxx:208
void add_members_from(const PART *source)
Definition: CT_part.cxx:185
void invert()
Definition: CT_part.cxx:149
int distanceTo(const PART *other, const PART *this_superset, const PART *other_superset) const
Definition: CT_part.cxx:397
Definition: CT_part.hxx:62
uint8_t bytebits[256]
Definition: CT_part.cxx:244
fputs(TRACE_PREFIX, stderr)
str readable(const copy< T > &v)
Definition: test_unit.h:345
int insertionOrder_cmp(const PART *other) const
Definition: CT_part.cxx:332
bool is_real_son_of(const PART *father) const
Definition: CT_part.hxx:214
const TreeNode * get_origin() const
Definition: CT_part.hxx:129
#define NULp
Definition: cxxforward.h:116
bool is_subset_of(const PART *other) const
Definition: CT_part.hxx:205
int distance_to_tree_center() const
Definition: CT_part.hxx:229
bool overlaps_with(const PART *other) const
Definition: CT_part.cxx:135
int get_bits() const
Definition: CT_part.hxx:54
GBT_LEN get_len() const
Definition: CT_part.hxx:171
#define min(a, b)
Definition: f2c.h:153
CONSTEXPR_INLINE int double_cmp(const double d1, const double d2)
Definition: arbtools.h:185
void destroy_part(PART *part)
Definition: CT_part.cxx:531
#define DUMPINT(var)
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1294
void print(const T &t)
Definition: test_unit.h:359