ARB
PT_partition.h
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : PT_partition.h //
4 // Purpose : //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in October 2012 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #ifndef PT_PARTITION_H
13 #define PT_PARTITION_H
14 
15 #ifndef PT_PREFIXITER_H
16 #include "PT_prefixIter.h"
17 #endif
18 #ifndef _GLIBCXX_CMATH
19 #include <cmath>
20 #endif
21 
22 // --------------------------------------------------------------------------------
23 // central settings for memory estimations
24 //
25 
26 #ifdef ARB_64
27 
28 # define STAGE1_INDEX_BYTES_PER_PASS_OLIGO 3.7 // size of index (for each oligo inserted in one pass)
29 # define STAGE1_INDEX_BYTES_PER_BASE 0.1 // size of index (for each bp in database)
30 # define STAGE1_OTHER_BYTES_PER_PASS_OLIGO 1.0 // non-index data (for each oligo inserted in one pass)
31 # define STAGE1_OTHER_BYTES_PER_BASE 1.15 // non-index data (for each bp in database)
32 
33 # define STAGE1_INDEX_EXTRA_MB 350 // additional constant memory used by index (+ a bit safety)
34 # define STAGE1_OTHER_EXTRA_MB 80 // additional constant memory used elsewhere (+ a bit safety)
35 
36 #else
37 
38 # define STAGE1_INDEX_BYTES_PER_PASS_OLIGO 3.2 // size of index (for each oligo inserted in one pass)
39 # define STAGE1_INDEX_BYTES_PER_BASE 0 // size of index (for each bp in database)
40 # define STAGE1_OTHER_BYTES_PER_PASS_OLIGO 0.7 // non-index data (for each oligo inserted in one pass)
41 # define STAGE1_OTHER_BYTES_PER_BASE 1 // non-index data (for each bp in database)
42 
43 # define STAGE1_INDEX_EXTRA_MB 150 // additional constant memory used by index (+ a bit safety)
44 # define STAGE1_OTHER_EXTRA_MB 50 // additional constant memory used elsewhere (+ a bit safety)
45 
46 #endif
47 
48 # define PTSERVER_BIN_MB 20 // binary mem footprint of ptserver (incl. libs, w/o DB) detected using pmap
49 
50 #define STAGE1_BYTES_PER_PASS_OLIGO (STAGE1_INDEX_BYTES_PER_PASS_OLIGO + STAGE1_OTHER_BYTES_PER_PASS_OLIGO)
51 #define STAGE1_BYTES_PER_BASE (STAGE1_INDEX_BYTES_PER_BASE + STAGE1_OTHER_BYTES_PER_BASE)
52 #define STAGE1_EXTRA_MB (STAGE1_INDEX_EXTRA_MB + STAGE1_OTHER_EXTRA_MB)
53 
55  return ULONG(STAGE1_BYTES_PER_PASS_OLIGO * partition_bp / 1024.0 +
56  STAGE1_BYTES_PER_BASE * all_bp / 1024.0 +
57  STAGE1_EXTRA_MB * 1024 +
58  0.5);
59 }
60 
61 static double base_probability(char base) {
62  pt_assert(base >= PT_QU && base < PT_BASES);
63  static double bprob[PT_BASES] = {
64  // probabilities are taken from silva-108-SSU-ref
65  0.0014, // PT_QU
66  0.0003, // PT_N
67  0.2543, // PT_A
68  0.2268, // PT_C
69  0.3074, // PT_G
70  0.2098, // PT_T
71  };
72  return bprob[safeCharIndex(base)];
73 }
74 
75 inline double calc_probability(const char *prefix, size_t length) {
76  double prob = 1.0;
77  for (size_t i = 0; i<length; ++i) {
78  prob *= base_probability(prefix[i]);
79  }
80  return prob;
81 }
82 
84  int depth;
85  int prefixes;
86  double *left_prob;
87 
88 public:
89  PrefixProbabilities(int depth_)
90  : depth(depth_)
91  {
92  PrefixIterator prefix(PT_QU, PT_T, depth);
93  prefixes = prefix.steps();
94 
95  left_prob = new double[prefixes+1];
96  left_prob[0] = 0.0;
97 
98  double prob[prefixes];
99 
100  int i = 0;
101  while (!prefix.done()) {
102  pt_assert(i<prefixes);
103  prob[i] = calc_probability(prefix.prefix(), prefix.length());
104  left_prob[i+1] = left_prob[i]+prob[i];
105  ++i;
106  ++prefix;
107  }
108  }
110  : depth(other.depth),
111  prefixes(other.prefixes)
112  {
113  left_prob = new double[prefixes+1];
114  memcpy(left_prob, other.left_prob, sizeof(*left_prob)*(prefixes+1));
115  }
117  ~PrefixProbabilities() { delete [] left_prob; }
118 
119  int get_prefix_count() const { return prefixes; }
120  int get_depth() const { return depth; }
121 
122  double left_of(int prefix_idx) const {
123  pt_assert(prefix_idx >= 0 && prefix_idx <= prefixes); // index must be in range [0 ... prefixes]
124  return left_prob[prefix_idx];
125  }
126  double of_range(int first_idx, int last_idx) const {
127  pt_assert(first_idx <= last_idx);
128  pt_assert(first_idx >= 0 && last_idx<prefixes); // index must be in range [0 ... prefixes-1]
129 
130  return left_of(last_idx+1) - left_of(first_idx);
131  }
132  double of(int prefix_idx) {
133  pt_assert(prefix_idx >= 0 && prefix_idx<prefixes); // index must be in range [0 ... prefixes-1]
134  return of_range(prefix_idx, prefix_idx);
135  }
136 
137  int find_index_near_leftsum(double wanted) const {
138  // returned index is in range [0 ... prefixes]
139 
140  int best_idx;
141  if (prefixes == 0) best_idx = 0;
142  else {
143  int low = 0;
144  int high = prefixes;
145 
146  double lol = left_of(low);
147  if (lol >= wanted) best_idx = low;
148  else {
149  double loh = left_of(high);
150  if (loh<wanted) best_idx = high;
151  else {
152  while ((low+1)<high) {
153  pt_assert(lol<wanted && wanted<=loh);
154 
155  int mid = (low+high)/2;
156  pt_assert(mid != low && mid != high);
157 
158  double left_of_mid = left_of(mid);
159  if (left_of_mid<wanted) {
160  low = mid;
161  lol = left_of_mid;
162  }
163  else {
164  high = mid;
165  loh = left_of_mid;
166  }
167  }
168  best_idx = fabs(lol-wanted) < fabs(loh-wanted) ? low : high;
169  }
170  }
171  }
172  return best_idx;
173  }
174 };
175 
176 class DecisionTree : virtual Noncopyable {
177  // associate bool-values with probe-prefixes
178 
179  bool decision;
180  bool decided;
181  DecisionTree *child[PT_BASES]; // all NULp if decided
182 
183  void forgetChilds() {
184  for (int i = PT_QU; i<PT_BASES; ++i) {
185  delete child[i];
186  child[i] = NULp;
187  }
188  }
189 
190 public:
192  : decided(false)
193  {
194  for (int i = PT_QU; i<PT_BASES; ++i) {
195  child[i] = NULp;
196  }
197  }
199  forgetChilds();
200  }
201 
202  void setValue(const char *probe, size_t len, bool wanted) {
203  if (len>0) {
204  DecisionTree*& probe_child = child[safeCharIndex(*probe)];
205  if (!probe_child) probe_child = new DecisionTree;
206  probe_child->setValue(probe+1, len-1, wanted);
207  }
208  else {
209  decision = wanted;
210  decided = true;
211  }
212  }
213 
214  bool getValue(const char *probe) const {
215  if (decided) return decision;
216  return child[safeCharIndex(*probe)]->getValue(probe+1);
217  }
218 
219  void optimize() {
220  if (!decided) {
221  child[PT_QU]->optimize();
222  bool concordant = child[PT_QU]->decided;
223  bool common_decision = concordant ? child[PT_QU]->decision : false;
224 
225  for (int i = PT_QU+1; i<PT_BASES; ++i) {
226  child[i]->optimize();
227  if (concordant) {
228  if (child[i]->decided) {
229  if (child[i]->decision != common_decision) {
230  concordant = false;
231  }
232  }
233  else {
234  concordant = false;
235  }
236  }
237  }
238 
239  if (concordant) {
240  forgetChilds();
241  decided = true;
242  decision = common_decision;
243  }
244  }
245  }
246 };
247 
249  int depth;
250  int max_partitions;
251  bool *marked;
252 
253  DecisionTree *decision;
254 
255  void forget_decision_tree() {
256  delete decision;
257  decision = NULp;
258  }
259 
260 public:
261  MarkedPrefixes(int depth_)
262  : depth(depth_),
263  max_partitions(PrefixIterator(PT_QU, PT_T, depth).steps()),
264  marked(new bool[max_partitions]),
265  decision(NULp)
266  {
267  clearMarks();
268  }
270  : depth(other.depth),
271  max_partitions(other.max_partitions),
272  marked(new bool[max_partitions]),
273  decision(NULp)
274  {
275  memcpy(marked, other.marked, max_partitions*sizeof(*marked));
276  }
279  delete [] marked;
280  delete decision;
281  }
282 
283  void mark(int idx1, int idx2) {
284  pt_assert(idx1 <= idx2);
285  pt_assert(idx1 >= 0);
286  pt_assert(idx2 < max_partitions);
287 
288  for (int p = idx1; p <= idx2; ++p) {
289  marked[p] = true;
290  }
291  }
292  void clearMarks() {
293  for (int p = 0; p<max_partitions; ++p) {
294  marked[p] = false;
295  }
296  }
297 
298  void predecide() {
299  forget_decision_tree();
300  decision = new DecisionTree;
301 
302  PrefixIterator iter(PT_QU, PT_T, depth);
303  int idx = 0;
304 
305  while (!iter.done()) {
306  pt_assert(idx<max_partitions);
307  decision->setValue(iter.prefix(), iter.length(), marked[idx]);
308  ++iter;
309  ++idx;
310  }
311  decision->optimize();
312  }
313 
314 
315  bool isMarked(const char *probe) const {
316  pt_assert(decision); // predecide() not called
317  return decision->getValue(probe);
318  }
319 };
320 
321 class Partition {
322  PrefixProbabilities prob;
323 
324  int passes;
325  int current_pass;
326 
327  MarkedPrefixes prefix;
328 
329  int *start_of_pass; // contains prefix index
330 
331  mutable size_t max_probes_in_any_pass;
332 
333  int first_index_of_pass(int pass) const {
334  pt_assert(legal_pass(pass));
335  return start_of_pass[pass-1];
336  }
337  int last_index_of_pass(int pass) const {
338  pt_assert(legal_pass(pass));
339  return start_of_pass[pass]-1;
340  }
341 
342  bool have_zero_prob_passes() {
343  for (int p = 1; p <= passes; ++p) {
344  if (pass_probability(p) <= 0.0) return true;
345  }
346  return false;
347  }
348 
349  void calculate_pass_starts() {
350  pt_assert(passes <= max_allowed_passes());
351  delete [] start_of_pass;
352  start_of_pass = new int[passes+1];
353 
354  double prob_per_pass = 1.0/passes;
355 
356  start_of_pass[0] = 0;
357  for (int p = 1; p < passes; ++p) {
358  double pass_prob = p*prob_per_pass;
359  start_of_pass[p] = prob.find_index_near_leftsum(pass_prob);
360  }
361  start_of_pass[passes] = max_allowed_passes();
362 
363  // ensure there are no empty passes:
364  for (int p = 0; p<passes; ++p) {
365  if (start_of_pass[p] >= start_of_pass[p+1]) {
366  int nidx = start_of_pass[p]+1;
367  if (nidx<max_allowed_passes()) {
368  start_of_pass[p+1] = nidx;
369  }
370  }
371  }
372  for (int p = passes-1; p >= 0; --p) {
373  if (start_of_pass[p] >= start_of_pass[p+1]) {
374  start_of_pass[p] = start_of_pass[p+1]-1;
375  pt_assert(start_of_pass[p] >= 0);
376  }
377  }
378 
379  pt_assert(!have_zero_prob_passes());
380  }
381 
382  void markPrefixes() {
383  pt_assert(!done());
384 
385  prefix.clearMarks();
386  prefix.mark(first_index_of_pass(current_pass), last_index_of_pass(current_pass));
387  prefix.predecide();
388  }
389 
390  bool legal_pass(int pass) const { return pass >= 1 && pass <= passes; }
391 
392 public:
393  Partition(const PrefixProbabilities& prob_, int passes_)
394  : prob(prob_),
395  passes(passes_),
396  prefix(prob.get_depth()),
397  start_of_pass(new int[passes+1]),
398  max_probes_in_any_pass(0)
399  {
400  pt_assert(passes >= 1 && passes <= max_allowed_passes());
401  calculate_pass_starts();
402  reset();
403  }
404  Partition(const Partition& other)
405  : prob(other.prob),
406  passes(other.passes),
407  current_pass(other.current_pass),
408  prefix(other.prefix),
409  start_of_pass(new int[passes+1]),
410  max_probes_in_any_pass(other.max_probes_in_any_pass)
411  {
412  memcpy(start_of_pass, other.start_of_pass, sizeof(*start_of_pass)*(passes+1));
413  prefix.predecide();
414  }
417  delete [] start_of_pass;
418  }
419 
420  double pass_probability(int pass) const {
421  return prob.of_range(first_index_of_pass(pass), last_index_of_pass(pass));
422  }
423 
424  int max_allowed_passes() const { return prob.get_prefix_count(); }
425  int number_of_passes() const { return passes; }
426  int split_depth() const { return prob.get_depth(); }
427 
428  bool done() const { return !legal_pass(current_pass); }
429  bool next() {
430  ++current_pass;
431  if (done()) return false;
432  markPrefixes();
433  return true;
434  }
435 
436  void reset() {
437  current_pass = 1;
438  markPrefixes();
439  }
440 
441  size_t estimate_probes_for_pass(int pass, size_t overall_base_count) const {
442  pt_assert(legal_pass(pass));
443  return size_t(pass_probability(pass)*overall_base_count+0.5);
444  }
445  size_t estimate_max_probes_for_any_pass(size_t overall_base_count) const {
446  if (max_probes_in_any_pass == 0) { // lazy eval
447  for (int p = 1; p <= passes; ++p) {
448  size_t probes = estimate_probes_for_pass(p, overall_base_count);
449  if (probes>max_probes_in_any_pass) max_probes_in_any_pass = probes;
450  }
451  pt_assert(max_probes_in_any_pass);
452  }
453  return max_probes_in_any_pass;
454  }
455  size_t estimate_max_kb_for_any_pass(size_t overall_base_count) const {
456  return estimate_stage1_memusage_kb(overall_base_count, estimate_max_probes_for_any_pass(overall_base_count));
457  }
458 
459  bool contains(const char *probe) const {
460  return prefix.isMarked(probe);
461  }
462 };
463 
464 inline size_t max_probes_for_passes(const PrefixProbabilities& prob, int passes_wanted, size_t overall_base_count) {
465  return Partition(prob, passes_wanted).estimate_max_probes_for_any_pass(overall_base_count);
466 }
467 inline size_t max_kb_for_passes(const PrefixProbabilities& prob, int passes_wanted, size_t overall_base_count) {
468  return estimate_stage1_memusage_kb(overall_base_count, max_probes_for_passes(prob, passes_wanted, overall_base_count));
469 }
470 
471 #else
472 #error PT_partition.h included twice
473 #endif // PT_PARTITION_H
void mark(int idx1, int idx2)
Definition: PT_partition.h:283
int find_index_near_leftsum(double wanted) const
Definition: PT_partition.h:137
static double base_probability(char base)
Definition: PT_partition.h:61
Definition: probe.h:83
double pass_probability(int pass) const
Definition: PT_partition.h:420
#define STAGE1_BYTES_PER_BASE
Definition: PT_partition.h:51
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
void reset()
Definition: PT_partition.h:436
PrefixProbabilities(const PrefixProbabilities &other)
Definition: PT_partition.h:109
unsigned long ULONG
Definition: probe.h:50
size_t estimate_probes_for_pass(int pass, size_t overall_base_count) const
Definition: PT_partition.h:441
Partition(const PrefixProbabilities &prob_, int passes_)
Definition: PT_partition.h:393
size_t steps() const
void setValue(const char *probe, size_t len, bool wanted)
Definition: PT_partition.h:202
int number_of_passes() const
Definition: PT_partition.h:425
int get_depth() const
Definition: PT_partition.h:120
size_t length() const
Definition: PT_prefixIter.h:89
size_t estimate_max_kb_for_any_pass(size_t overall_base_count) const
Definition: PT_partition.h:455
double calc_probability(const char *prefix, size_t length)
Definition: PT_partition.h:75
bool isMarked(const char *probe) const
Definition: PT_partition.h:315
PrefixProbabilities(int depth_)
Definition: PT_partition.h:89
size_t max_probes_for_passes(const PrefixProbabilities &prob, int passes_wanted, size_t overall_base_count)
Definition: PT_partition.h:464
size_t estimate_max_probes_for_any_pass(size_t overall_base_count) const
Definition: PT_partition.h:445
#define STAGE1_BYTES_PER_PASS_OLIGO
Definition: PT_partition.h:50
#define false
Definition: ureadseq.h:13
DECLARE_ASSIGNMENT_OPERATOR(Partition)
#define CONSTEXPR_INLINE
Definition: cxxforward.h:111
double of(int prefix_idx)
Definition: PT_partition.h:132
Definition: probe.h:88
DECLARE_ASSIGNMENT_OPERATOR(MarkedPrefixes)
#define pt_assert(bed)
Definition: PT_tools.h:22
DECLARE_ASSIGNMENT_OPERATOR(PrefixProbabilities)
MarkedPrefixes(const MarkedPrefixes &other)
Definition: PT_partition.h:269
MarkedPrefixes(int depth_)
Definition: PT_partition.h:261
CONSTEXPR_INLINE ULONG estimate_stage1_memusage_kb(ULONG all_bp, ULONG partition_bp)
Definition: PT_partition.h:54
void optimize()
Definition: PT_partition.h:219
double left_of(int prefix_idx) const
Definition: PT_partition.h:122
double of_range(int first_idx, int last_idx) const
Definition: PT_partition.h:126
bool next()
Definition: PT_partition.h:429
Partition(const Partition &other)
Definition: PT_partition.h:404
bool getValue(const char *probe) const
Definition: PT_partition.h:214
bool done() const
Definition: PT_partition.h:428
const char * prefix() const
Definition: PT_prefixIter.h:88
Definition: probe.h:89
int get_prefix_count() const
Definition: PT_partition.h:119
#define NULp
Definition: cxxforward.h:116
bool done() const
bool contains(const char *probe) const
Definition: PT_partition.h:459
int max_allowed_passes() const
Definition: PT_partition.h:424
#define STAGE1_EXTRA_MB
Definition: PT_partition.h:52
size_t length
size_t max_kb_for_passes(const PrefixProbabilities &prob, int passes_wanted, size_t overall_base_count)
Definition: PT_partition.h:467
int split_depth() const
Definition: PT_partition.h:426
GB_ERROR mid(GBL_command_arguments *args, int start_index)
Definition: adlang1.cxx:907