ARB
ps_candidate.hxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ps_candidate.hxx //
4 // Purpose : //
5 // //
6 // Coded by Wolfram Foerster in October 2002 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #ifndef PS_CANDIDATE_HXX
13 #define PS_CANDIDATE_HXX
14 
15 #ifndef PS_DEFS_HXX
16 #include "ps_defs.hxx"
17 #endif
18 #ifndef PS_BITMAP_HXX
19 #include "ps_bitmap.hxx"
20 #endif
21 
22 #ifndef _GLIBCXX_CLIMITS
23 #include <climits>
24 #endif
25 
26 using namespace std;
27 
31 
33  bool operator()(const PS_CandidateSPtr &c1, const PS_CandidateSPtr &c2) const {
34  return &(*c1) < &(*c2);
35  }
36 };
37 typedef set<PS_CandidateSPtr, cmp_candidates> PS_CandidateSet;
38 typedef PS_CandidateSet::iterator PS_CandidateSetIter;
39 typedef PS_CandidateSet::const_iterator PS_CandidateSetCIter;
40 typedef PS_CandidateSet::reverse_iterator PS_CandidateSetRIter;
41 typedef PS_CandidateSet::const_reverse_iterator PS_CandidateSetCRIter;
42 
43 typedef map<unsigned long, PS_CandidateSPtr> PS_CandidateByGainMap;
44 typedef PS_CandidateByGainMap::iterator PS_CandidateByGainMapIter;
45 typedef PS_CandidateByGainMap::const_iterator PS_CandidateByGainMapCIter;
46 typedef PS_CandidateByGainMap::reverse_iterator PS_CandidateByGainMapRIter;
47 typedef PS_CandidateByGainMap::const_reverse_iterator PS_CandidateByGainMapCRIter;
48 
49 typedef pair<SpeciesID, PS_BitSet::IndexSet> ID2IndexSetPair;
50 typedef set<ID2IndexSetPair> ID2IndexSetSet;
51 typedef ID2IndexSetSet::iterator ID2IndexSetSetIter;
52 typedef ID2IndexSetSet::const_iterator ID2IndexSetSetCIter;
53 typedef ID2IndexSetSet::reverse_iterator ID2IndexSetSetRIter;
54 typedef ID2IndexSetSet::const_reverse_iterator ID2IndexSetSetCRIter;
55 
56 class PS_Candidate : virtual Noncopyable {
57  PS_Candidate();
58  PS_Candidate(const PS_Candidate&);
59  explicit PS_Candidate(float _distance, unsigned long _gain, const PS_NodePtr& _ps_node, IDSet &_path, PS_CandidatePtr _parent) {
60  filling_level = 0.0;
61  depth = ULONG_MAX;
62  distance = _distance;
63  gain = _gain;
64  passes_left = MAX_PASSES;
65  parent = _parent;
66  false_IDs = 0;
67  one_false_IDs = NULp;
68  one_false_IDs_matches = 0;
69  map = NULp;
70  node = _ps_node;
71  path = _path;
72  }
73 
74 public:
75  static const unsigned int MAX_PASSES = 3;
77  unsigned long depth;
78  float distance;
79  unsigned long gain;
80  unsigned int passes_left;
82  unsigned long false_IDs;
84  unsigned long one_false_IDs_matches;
85  PS_BitMap_Counted *map;
89 
90  unsigned long initFalseIDs(const SpeciesID _min_id,
91  const SpeciesID _max_id,
92  SpeciesID &_min_sets_id,
93  SpeciesID &_max_sets_id) {
94  // if i already have the set return its size
95  if (one_false_IDs) {
96  return one_false_IDs->size();
97  }
98  else {
99  one_false_IDs = new ID2IDSet;
100  false_IDs = 0;
101  PS_BitSet::IndexSet falseIndices; // temp. set to hold IDs of falses per ID
102  // iterate over _min_id .. _max_id range in map
103  for (SpeciesID id1 = _min_id;
104  id1 <= _max_id;
105  ++id1) {
106  if (map->getCountFor(id1) == _max_id+1) continue; // skip ID if its already filled
107  if (id1 < _min_sets_id) _min_sets_id = id1;
108  if (id1 > _max_sets_id) _max_sets_id = id1;
109  map->getFalseIndicesFor(id1, falseIndices);
110  while ((*(falseIndices.begin()) < id1) &&
111  !falseIndices.empty()) falseIndices.erase(*falseIndices.begin());
112  if (falseIndices.empty()) continue;
113  if (*falseIndices.begin() < _min_sets_id) _min_sets_id = *falseIndices.begin();
114  if (*falseIndices.rbegin() > _max_sets_id) _max_sets_id = *falseIndices.rbegin();
115  // store
116  false_IDs += falseIndices.size();
117  if (falseIndices.size() == 1) {
118  one_false_IDs->insert(ID2IDPair(id1, *falseIndices.begin()));
119  }
120  }
121  }
122  return one_false_IDs->size();
123  }
124 
125  unsigned long matchPathOnOneFalseIDs(IDSet &_path) {
126  unsigned long matches = 0;
127  IDSetCIter path_end = _path.end();
128  for (ID2IDSetCIter p = one_false_IDs->begin();
129  p != one_false_IDs->end();
130  ++p) {
131  if (_path.find(p->first) == path_end) {
132  if (_path.find(p->second) != path_end) ++matches;
133  }
134  else {
135  if (_path.find(p->second) == path_end) ++matches;
136  }
137  }
138  return matches;
139  }
140 
141  void decreasePasses() {
142  --passes_left;
143  }
144 
145  void getParentMap() {
146  if (!parent) return;
147  map = parent->map; // grab parents version of the map
148  parent->map = NULp; // unbind parent from map
149  }
150 
151  bool hasChild(PS_NodePtr _ps_node) const {
152  PS_CandidateByGainMapCIter found = children.end();
153  for (PS_CandidateByGainMapCIter child = children.begin();
154  (child != children.end()) && (found == children.end());
155  ++child) {
156  if (child->second->node == _ps_node) found = child;
157  }
158  return found != children.end();
159  }
160 
161  bool alreadyUsedNode(const PS_NodePtr& _ps_node) const {
162  if (node.isNull()) return false;
163  if (_ps_node == node) {
164  return true;
165  }
166  else {
167  return parent->alreadyUsedNode(_ps_node);
168  }
169  }
170 
171  int addChild(unsigned long _distance,
172  unsigned long _gain,
173  const PS_NodePtr& _node,
174  IDSet& _path)
175  {
176  PS_CandidateByGainMapIter found = children.find(_gain);
177  if (found == children.end()) {
178  PS_CandidateSPtr new_child(new PS_Candidate(_distance, _gain, _node, _path, this));
179  children[_gain] = new_child;
180  return 2;
181  }
182  else if (_distance < found->second->distance) {
183  children.erase(_gain);
184  PS_CandidateSPtr new_child(new PS_Candidate(_distance, _gain, _node, _path, this));
185  children[_gain] = new_child;
186  return 1;
187  }
188  return 0;
189  }
190 
191  bool updateBestChild(const unsigned long _gain,
192  const unsigned long _one_false_IDs_matches,
193  const float _filling_level,
194  const PS_NodePtr& _node,
195  IDSet& _path)
196  {
197  if (children.empty()) {
198  // no child yet
199  PS_CandidateSPtr new_child(new PS_Candidate(0, _gain, _node, _path, this));
200  new_child->depth = depth+1;
201  new_child->filling_level = _filling_level;
202  if (_filling_level >= 100.0) new_child->passes_left = 0;
203  children[0] = new_child;
204  one_false_IDs_matches = _one_false_IDs_matches;
205  return true;
206  }
207  // return false if new child matches less 'must matches' than best child so far
208  if (_one_false_IDs_matches < one_false_IDs_matches) return false;
209  // return false if new child matches same 'must matches' but has less total gain
210  if ((_one_false_IDs_matches == one_false_IDs_matches) &&
211  (_gain <= children[0]->gain)) return false;
212 
213  children.clear();
214  PS_CandidateSPtr new_child(new PS_Candidate(0, _gain, _node, _path, this));
215  new_child->depth = depth+1;
216  new_child->filling_level = _filling_level;
217  if (_filling_level >= 100.0) new_child->passes_left = 0;
218  children[0] = new_child;
219  one_false_IDs_matches = _one_false_IDs_matches;
220  return true;
221  }
222 
223  void reduceChildren(const float _filling_level) {
224  if (children.empty()) return;
225  // prepare
226  unsigned long best_gain = children.rbegin()->first;
227  unsigned long worst_gain = children.begin()->first;
228  unsigned long middle_gain = (best_gain + worst_gain) >> 1;
229  PS_CandidateByGainMapIter middle_child = children.find(middle_gain);
230  if (middle_child == children.end()) {
231  middle_child = children.lower_bound(middle_gain);
232  middle_gain = middle_child->first;
233  }
234  PS_CandidateByGainMapIter to_delete = children.end();
235  // delete unwanted children depending on filling level
236  if (_filling_level < 50.0) {
237  if (children.size() <= 3) return;
238  for (PS_CandidateByGainMapIter c = children.begin();
239  c != children.end();
240  ++c) {
241  if (to_delete != children.end()) {
242  children.erase(to_delete);
243  to_delete = children.end();
244  }
245  if ((c->first != worst_gain) &&
246  (c->first != middle_gain) &&
247  (c->first != best_gain)) {
248  to_delete = c;
249  }
250  }
251  }
252  else if (_filling_level < 75.0) {
253  if (children.size() <= 2) return;
254  for (PS_CandidateByGainMapIter c = children.begin();
255  c != children.end();
256  ++c) {
257  if (to_delete != children.end()) {
258  children.erase(to_delete);
259  to_delete = children.end();
260  }
261  if ((c->first != middle_gain) &&
262  (c->first != best_gain)) {
263  to_delete = c;
264  }
265  }
266  }
267  else {
268  if (children.size() <= 1) return;
269  for (PS_CandidateByGainMapIter c = children.begin();
270  c != children.end();
271  ++c) {
272  if (to_delete != children.end()) {
273  children.erase(to_delete);
274  to_delete = children.end();
275  }
276  if (c->first != best_gain) {
277  to_delete = c;
278  }
279  }
280  }
281  if (to_delete != children.end()) {
282  children.erase(to_delete);
283  to_delete = children.end();
284  }
285  }
286 
287  void printProbes(const SpeciesID _species_count,
288  const unsigned long _depth = 0,
289  const bool _descend = true) const {
290  for (unsigned long i = 0; i < _depth; ++i) {
291  printf("| ");
292  }
293  printf("\n");
294  if (node.isNull()) {
295  for (unsigned long i = 0; i < _depth; ++i) {
296  printf("| ");
297  }
298  printf("[%p] depth (%lu) no node\n", this, _depth);
299  }
300  else if (node->countProbes() == 0) {
301  for (unsigned long i = 0; i < _depth; ++i) {
302  printf("| ");
303  }
304  printf("[%p] depth (%lu) node (%p) no probes\n", this, _depth, &(*node));
305  }
306  else {
307  for (PS_ProbeSetCIter probe = node->getProbesBegin();
308  probe != node->getProbesEnd();
309  ++probe) {
310  for (unsigned long i = 0; i < _depth; ++i) {
311  printf("| ");
312  }
313  printf("[%p] depth (%lu) node (%p) matches (%zu) %u %u\n",
314  this,
315  _depth,
316  &(*node),
317  ((*probe)->quality > 0) ? path.size() : _species_count - path.size(),
318  (*probe)->length,
319  (*probe)->GC_content);
320  }
321  }
322  fflush(stdout);
323 
324  if (_descend) {
325  for (PS_CandidateByGainMapCRIter child = children.rbegin();
326  child != children.rend();
327  ++child) {
328  child->second->printProbes(_species_count, _depth+1);
329  }
330  }
331 
332  }
333 
334  void print (const unsigned long _depth = 0,
335  const bool _print_one_false_IDs = false,
336  const bool _descend = true) const {
337  for (unsigned long i = 0; i < _depth; ++i) {
338  printf("| ");
339  }
340  printf("[%p] passes left (%u) filling level (%.5f) distance (%6.2f) gain (%6lu) depth (%3lu) path length (%4zu) ",
341  this,
342  passes_left,
343  filling_level,
344  distance,
345  gain,
346  depth,
347  path.size());
348  if (node.isNull()) {
349  printf("node (undefined) children (%2zu) ", children.size());
350  }
351  else {
352  printf("node (%p) children (%2zu) ", &(*node), children.size());
353  }
354  printf("(%4lu %4zu)/%lu", one_false_IDs_matches, (one_false_IDs) ? one_false_IDs->size() : 0, false_IDs);
355  if (_print_one_false_IDs) {
356  printf("\n");
357  for (unsigned long i = 0; i < _depth; ++i) {
358  printf("| ");
359  }
360  printf(" one_false_IDs : ");
361  if (one_false_IDs) {
362  for (ID2IDSetCIter p = one_false_IDs->begin();
363  p != one_false_IDs->end();
364  ++p) {
365  printf("(%i %i) ", p->first, p->second);
366  }
367  }
368  else {
369  printf("none");
370  }
371  }
372  printf("\n"); fflush(stdout);
373  if (_descend) {
374  for (PS_CandidateByGainMapCRIter child = children.rbegin();
375  child != children.rend();
376  ++child) {
377  child->second->print(_depth+1);
378  }
379  }
380  }
381 
382  void save(PS_FileBuffer *_file,
383  const unsigned long _bits_in_map) {
384  unsigned long count;
385  // gain
386  _file->put_ulong(gain);
387  // passes_left
388  _file->put_uint(passes_left);
389  // false_IDs
390  _file->put_ulong(false_IDs);
391  // one_false_IDs
392  if (one_false_IDs) {
393  count = one_false_IDs->size();
394  _file->put_ulong(count);
395  for (ID2IDSetCIter p = one_false_IDs->begin();
396  p != one_false_IDs->end();
397  ++p) {
398  _file->put_int(p->first);
399  _file->put_int(p->second);
400  }
401  }
402  else {
403  _file->put_ulong(0);
404  }
405  // one_false_IDs_matches
406  _file->put_ulong(one_false_IDs_matches);
407  // path
408  count = path.size();
409  _file->put_ulong(count);
410  for (IDSetCIter id = path.begin();
411  id != path.end();
412  ++id) {
413  _file->put_int(*id);
414  }
415  // children
416  count = children.size();
417  _file->put_ulong(count);
418  for (PS_CandidateByGainMapIter child = children.begin();
419  child != children.end();
420  ++child) {
421  child->second->save(_file, _bits_in_map);
422  }
423  }
424 
425  void load(PS_FileBuffer *_file,
426  const unsigned long _bits_in_map,
427  const PS_NodePtr& _root_node)
428  {
429  unsigned long count;
430  // gain
431  _file->get_ulong(gain);
432  // passes_left
433  _file->get_uint(passes_left);
434  // false_IDs & filling_level
435  _file->get_ulong(false_IDs);
436  filling_level = (float)(_bits_in_map - false_IDs) / _bits_in_map * 100.0;
437  // one_false_IDs
438  SpeciesID id1;
439  SpeciesID id2;
440  _file->get_ulong(count);
441  one_false_IDs = (count) ? new ID2IDSet : NULp;
442  for (; count > 0; --count) {
443  _file->get_int(id1);
444  _file->get_int(id2);
445  one_false_IDs->insert(ID2IDPair(id1, id2));
446  }
447  // one_false_IDs_matches
448  _file->get_ulong(one_false_IDs_matches);
449  // path & node
450  _file->get_ulong(count);
451  if (count) node = _root_node;
452  for (; count > 0; --count) {
453  _file->get_int(id1);
454  path.insert(id1);
455  node = node->getChild(id1).second;
456  }
457  // children
458  _file->get_ulong(count);
459  for (; count > 0; --count) {
460  PS_CandidateSPtr new_child(new PS_Candidate(0.0));
461  new_child->depth = depth+1;
462  new_child->parent = this;
463  new_child->load(_file, _bits_in_map, _root_node);
464  children[new_child->gain] = new_child;
465  }
466  }
467 
468  explicit PS_Candidate(float _distance) {
469  filling_level = 0.0;
470  depth = 0;
471  distance = _distance;
472  gain = 0;
473  passes_left = 0;
474  parent = NULp;
475  false_IDs = 0;
476  one_false_IDs = NULp;
477  one_false_IDs_matches = 0;
478  map = NULp;
479  node.setNull();
480  }
482  if (map) delete map;
483  if (one_false_IDs) delete one_false_IDs;
484 
485  path.clear();
486  children.clear();
487  }
488 };
489 
490 #else
491 #error ps_candidate.hxx included twice
492 #endif // PS_CANDIDATE_HXX
void put_ulong(unsigned long int _ul)
PS_BitMap_Counted * map
unsigned long gain
const char * id
Definition: AliAdmin.cxx:17
unsigned int passes_left
set< ID2IndexSetPair > ID2IndexSetSet
PS_CandidateSet::iterator PS_CandidateSetIter
std::set< SpeciesID > IDSet
Definition: ps_defs.hxx:39
PS_CandidateSet::const_reverse_iterator PS_CandidateSetCRIter
bool alreadyUsedNode(const PS_NodePtr &_ps_node) const
void put_int(int _i)
void printProbes(const SpeciesID _species_count, const unsigned long _depth=0, const bool _descend=true) const
PS_CandidateSet::const_iterator PS_CandidateSetCIter
void decreasePasses()
PS_Candidate * PS_CandidatePtr
PS_ProbeSet::const_iterator PS_ProbeSetCIter
Definition: ps_node.hxx:52
size_t countProbes() const
Definition: ps_node.hxx:158
unsigned long false_IDs
STL namespace.
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248
PS_ProbeSetCIter getProbesBegin() const
Definition: ps_node.hxx:176
void setNull()
set SmartPtr to NULp
Definition: smartptr.h:251
void put_uint(unsigned int _ui)
bool operator()(const PS_CandidateSPtr &c1, const PS_CandidateSPtr &c2) const
PS_ProbeSetCIter getProbesEnd() const
Definition: ps_node.hxx:180
map< unsigned long, PS_CandidateSPtr > PS_CandidateByGainMap
void get_uint(unsigned int &_ui)
ID2IndexSetSet::const_iterator ID2IndexSetSetCIter
ID2IndexSetSet::reverse_iterator ID2IndexSetSetRIter
fflush(stdout)
set< PS_CandidateSPtr, cmp_candidates > PS_CandidateSet
void getParentMap()
SmartPtr< PS_Candidate > PS_CandidateSPtr
PS_CandidateByGainMap::iterator PS_CandidateByGainMapIter
unsigned long one_false_IDs_matches
Generic smart pointer.
Definition: smartptr.h:149
PS_CandidateByGainMap children
PS_CandidateSet::reverse_iterator PS_CandidateSetRIter
unsigned long initFalseIDs(const SpeciesID _min_id, const SpeciesID _max_id, SpeciesID &_min_sets_id, SpeciesID &_max_sets_id)
ID2IDSet * one_false_IDs
set< long > IndexSet
Definition: ps_bitset.hxx:27
std::set< ID2IDPair > ID2IDSet
Definition: ps_defs.hxx:54
PS_CandidateByGainMap::reverse_iterator PS_CandidateByGainMapRIter
ID2IDSet::const_iterator ID2IDSetCIter
Definition: ps_defs.hxx:56
unsigned long matchPathOnOneFalseIDs(IDSet &_path)
PS_Candidate(float _distance)
void reduceChildren(const float _filling_level)
int addChild(unsigned long _distance, unsigned long _gain, const PS_NodePtr &_node, IDSet &_path)
unsigned long depth
bool hasChild(PS_NodePtr _ps_node) const
bool updateBestChild(const unsigned long _gain, const unsigned long _one_false_IDs_matches, const float _filling_level, const PS_NodePtr &_node, IDSet &_path)
void get_int(int &_i)
void get_ulong(unsigned long int &_ul)
float filling_level
PS_Candidate * parent
PS_CandidateByGainMap::const_reverse_iterator PS_CandidateByGainMapCRIter
#define NULp
Definition: cxxforward.h:116
void load(PS_FileBuffer *_file, const unsigned long _bits_in_map, const PS_NodePtr &_root_node)
pair< bool, PS_NodePtr > getChild(SpeciesID id)
Definition: ps_node.hxx:101
ID2IndexSetSet::iterator ID2IndexSetSetIter
PS_CandidateByGainMap::const_iterator PS_CandidateByGainMapCIter
PS_NodePtr node
void print(const unsigned long _depth=0, const bool _print_one_false_IDs=false, const bool _descend=true) const
ID2IndexSetSet::const_reverse_iterator ID2IndexSetSetCRIter
pair< SpeciesID, PS_BitSet::IndexSet > ID2IndexSetPair
std::pair< SpeciesID, SpeciesID > ID2IDPair
Definition: ps_defs.hxx:53
void save(PS_FileBuffer *_file, const unsigned long _bits_in_map)
IDSet::const_iterator IDSetCIter
Definition: ps_defs.hxx:41