ARB
ps_eval_candidates.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ps_eval_candidates.cxx //
4 // Purpose : //
5 // //
6 // Coded by Wolfram Foerster in October 2002 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "ps_tools.hxx"
13 #include "ps_database.hxx"
14 #include "ps_candidate.hxx"
15 
16 #include <cmath>
17 #include <sys/times.h>
18 
19 
20 void PS_get_leaf_candidates(PS_CandidatePtr _candidate_parent, PS_CandidateSet &_leaf_candidates) {
21 
22  for (PS_CandidateByGainMapRIter next_candidate = _candidate_parent->children.rbegin();
23  next_candidate != _candidate_parent->children.rend();
24  ++next_candidate) {
25 
26  if (next_candidate->second->children.size() == 0) {
27  _leaf_candidates.insert(next_candidate->second);
28  }
29  PS_get_leaf_candidates(&(*next_candidate->second), _leaf_candidates);
30  }
31 }
32 
33 
34 typedef set<PS_Node *> PS_NodeSet;
35 typedef pair<PS_CandidatePtr, PS_NodeSet> PS_Candidate2NodeSetPair;
36 typedef multimap<unsigned long, PS_Candidate2NodeSetPair> PS_Candidate2NodeSetPairByLengthMap;
37 
39  PS_NodeSet nodepath;
40  for (PS_CandidateSetCIter candidate_iter = _leaf_candidates.begin();
41  candidate_iter != _leaf_candidates.end();
42  ++candidate_iter) {
43  // get nodepath of candidate
44  PS_CandidateSPtr candidate_sptr = *candidate_iter;
45  PS_CandidatePtr candidate = &(*candidate_sptr);
46  nodepath.clear();
47  while ((candidate) &&
48  (!candidate->node.isNull())) {
49  nodepath.insert(&(*(candidate->node)));
50  candidate = candidate->parent;
51  }
52  // compare nodepath of candidate with stored paths
53  PS_Candidate2NodeSetPairByLengthMap::iterator stored_path;
54  bool found = false;
55  for (stored_path = _paths.find(candidate->depth); // find first path with path length == depth
56  ((stored_path != _paths.end()) && // while not at end of paths
57  (stored_path->first == candidate->depth) && // and path length == depth
58  (!found)); // and not found yet
59  ++stored_path) {
60  found = stored_path->second.second == nodepath;
61  }
62  if (!found) {
63  // not found -> store nodepath
64  candidate_sptr = *candidate_iter;
65  candidate = &(*candidate_sptr);
66  _paths.insert(pair<unsigned long, PS_Candidate2NodeSetPair>(candidate->depth, PS_Candidate2NodeSetPair(candidate, nodepath)));
67  }
68  }
69 }
70 
71 
72 typedef set<unsigned long> ULSet;
73 typedef set<unsigned short> USSet;
74 typedef map<float, float> FFMap;
75 typedef set<float> FSet;
76 
77 inline unsigned long PS_calc_temp(const PS_ProbePtr &_probe) {
78  return 4*(_probe->length - _probe->GC_content) + 2*_probe->GC_content;
79 }
80 
81 void PS_calc_sums_for_nodepath(PS_NodeSet &_nodepath, ULSet &_sums) {
82  // iterate over nodes in path
83  ULSet sums_for_node;
84  ULSet temperatures;
85 
86  for (PS_NodeSet::const_iterator node = _nodepath.begin();
87  node != _nodepath.end();
88  ++node) {
89 
90  // iterate over probes of node
91  sums_for_node.clear();
92  temperatures.clear();
93 
94  for (PS_ProbeSetCIter probe_iter = (*node)->getProbesBegin();
95  probe_iter != (*node)->getProbesEnd();
96  ++probe_iter) {
97  PS_ProbePtr probe = *probe_iter;
98 
99  // test if already calculated sums for temperature of probe
100  unsigned long temperature = PS_calc_temp(probe);
101  ULSet::iterator found = temperatures.find(temperature);
102  if (found != temperatures.end()) continue;
103 
104  // calc sums
105  temperatures.insert(temperature);
106 
107  for (ULSet::iterator sum = _sums.begin();
108  sum != _sums.end();
109  ++sum) {
110  sums_for_node.insert(*sum + temperature);
111  }
112  }
113 
114  // store sums_for_node in _sums
115  _sums = sums_for_node;
116  }
117 }
119  FFMap &_averages,
120  float &_best_average,
121  float &_min_sum) {
122  // iterate over nodes in path
123  ULSet temperatures;
124  FSet sums_for_average;
125  float min_sum_for_node = -2.0;
126  float best_average = -2.0;
127 
128  for (PS_NodeSet::const_iterator node = _nodepath.begin();
129  (node != _nodepath.end()) &&
130  ((_min_sum < 0) ||
131  (_min_sum > min_sum_for_node));
132  ++node) {
133 
134  // iterate over averages
135  for (FFMap::iterator average = _averages.begin();
136  average != _averages.end();
137  ++average) {
138  if ((_min_sum > 0) && (average->second > _min_sum)) continue;
139 
140  // iterate over probes of node
141  sums_for_average.clear();
142  temperatures.clear();
143 
144  for (PS_ProbeSetCIter probe_iter = (*node)->getProbesBegin();
145  probe_iter != (*node)->getProbesEnd();
146  ++probe_iter) {
147  PS_ProbePtr probe = *probe_iter;
148 
149  // test if already calculated sum for GC-content of probe
150  unsigned long temperature = PS_calc_temp(probe);
151  ULSet::iterator found = temperatures.find(temperature);
152  if (found != temperatures.end()) continue;
153 
154  // calc sum
155  temperatures.insert(temperature);
156  float distance = fabsf(average->first - temperature);
157  float sum = average->second + (distance * distance);
158  sums_for_average.insert(sum);
159  }
160 
161  // store min sum
162  average->second = *sums_for_average.begin();
163  }
164 
165  // update min sum for node
166  min_sum_for_node = _averages.begin()->second;
167  best_average = _averages.begin()->first;
168  for (FFMap::iterator average = _averages.begin();
169  average != _averages.end();
170  ++average) {
171  if (average->second < min_sum_for_node) {
172  min_sum_for_node = average->second;
173  best_average = average->first;
174  }
175  }
176  }
177 
178  if ((_min_sum < 0) || (min_sum_for_node < _min_sum)) {
179  printf("UPDATED _best_average (%7.3f) _min_sum (%10.3f) <- best_average (%7.3f) min_sum_for_node (%10.3f)\n", _best_average, _min_sum, best_average, min_sum_for_node);
180  _min_sum = min_sum_for_node;
181  _best_average = best_average;
182  return true;
183  }
184  else {
185  printf("aborted _best_average (%7.3f) _min_sum (%10.3f) best_average (%7.3f) min_sum_for_node (%10.3f)\n", _best_average, _min_sum, best_average, min_sum_for_node);
186  return false;
187  }
188 }
190  float &_min_sum_of_square_distances_to_average,
191  float &_best_average) {
192  //
193  // initially calc min square distance for one candidate
194  //
195  PS_Candidate2NodeSetPairByLengthMap::iterator stored_path = _paths.begin();
196  // calc sums of GC-contents of probes for nodes in candidate's path
197  ULSet sums;
198  sums.insert(0);
199  PS_calc_sums_for_nodepath(stored_path->second.second, sums);
200  // calc average GC-contents
201  FFMap averages;
202  for (ULSet::iterator sum = sums.begin();
203  sum != sums.end();
204  ++sum) {
205  float avg = (float)(*sum) / stored_path->first;
206  averages[avg] = 0.0;
207  }
208  // search minimum of sum of square distance to average
209  _min_sum_of_square_distances_to_average = -1;
210  printf("[%p] distance: ", stored_path->second.first);
211  PS_calc_min_sum_of_square_distances_to_average(stored_path->second.second,
212  averages,
213  _best_average,
214  _min_sum_of_square_distances_to_average);
215 
216  // iterate over remaining candidates
217  PS_Candidate2NodeSetPairByLengthMap::iterator best_candidate = stored_path;
218  for (++stored_path;
219  stored_path != _paths.end();
220  ++stored_path) {
221 
222  // calc sums
223  sums.clear();
224  sums.insert(0);
225  PS_calc_sums_for_nodepath(stored_path->second.second, sums);
226 
227  // calc averages
228  averages.clear();
229  for (ULSet::iterator sum = sums.begin();
230  sum != sums.end();
231  ++sum) {
232  float avg = (float)(*sum) / stored_path->first;
233  averages[avg] = 0.0;
234  }
235 
236  // search min distance
237  printf("[%p] distance: ", stored_path->second.first);
238  if (PS_calc_min_sum_of_square_distances_to_average(stored_path->second.second,
239  averages,
240  _best_average,
241  _min_sum_of_square_distances_to_average)) {
242  best_candidate = stored_path;
243  }
244  }
245 
246  // remove all but best candidate
247  PS_Candidate2NodeSetPairByLengthMap::iterator to_delete = _paths.end();
248  for (stored_path = _paths.begin();
249  stored_path != _paths.end();
250  ++stored_path) {
251  if (to_delete != _paths.end()) {
252  _paths.erase(to_delete);
253  to_delete = _paths.end();
254  }
255  if (stored_path != best_candidate) to_delete = stored_path;
256  }
257  if (to_delete != _paths.end()) {
258  _paths.erase(to_delete);
259  to_delete = _paths.end();
260  }
261 }
262 
264  float _average,
265  set<unsigned int> &_probe_lengths) {
266  for (PS_NodeSet::const_iterator node_iter = _nodes.begin();
267  node_iter != _nodes.end();
268  ++node_iter) {
269  PS_Node *node = *node_iter;
270 
271  // calc min distance
272  float min_distance = _average * _average;
273  for (PS_ProbeSetCIter probe = node->getProbesBegin();
274  probe != node->getProbesEnd();
275  ++probe) {
276  float distance = fabsf(_average - PS_calc_temp(*probe));
277  if (distance < min_distance) min_distance = distance;
278  }
279 
280  // remove probes with distance > min_distance
281  PS_ProbeSetCIter to_delete = node->getProbesEnd();
282  for (PS_ProbeSetCIter probe = node->getProbesBegin();
283  probe != node->getProbesEnd();
284  ++probe) {
285  if (to_delete != node->getProbesEnd()) {
286  node->removeProbe(to_delete);
287  to_delete = node->getProbesEnd();
288  }
289  float distance = fabsf(_average - PS_calc_temp(*probe));
290  if (distance > min_distance) {
291  to_delete = probe;
292  }
293  }
294  if (to_delete != node->getProbesEnd()) {
295  node->removeProbe(to_delete);
296  to_delete = node->getProbesEnd();
297  }
298 
299  // select probe with greatest length
300  if (node->countProbes() > 1) {
301 
302  // get greatest length
303  unsigned max_length = 0;
304  for (PS_ProbeSetCIter probe = node->getProbesBegin();
305  probe != node->getProbesEnd();
306  ++probe) {
307  if (max_length < (*probe)->length) max_length = (*probe)->length;
308  }
309 
310  // remove probes with length < max_length
311  for (PS_ProbeSetCIter probe = node->getProbesBegin();
312  probe != node->getProbesEnd();
313  ++probe) {
314  if (to_delete != node->getProbesEnd()) {
315  node->removeProbe(to_delete);
316  to_delete = node->getProbesEnd();
317  }
318  if ((*probe)->length < max_length) {
319  to_delete = probe;
320  }
321  }
322  if (to_delete != node->getProbesEnd()) {
323  node->removeProbe(to_delete);
324  to_delete = node->getProbesEnd();
325  }
326  }
327  _probe_lengths.insert((*node->getProbesBegin())->length);
328  }
329 }
330 
331 // ====================================================
332 // ====================================================
333 int main(int argc,
334  char *argv[]) {
335  //
336  // check arguments
337  //
338  if (argc < 3) {
339  printf("Missing argument\n Usage %s <database name> <final candidates filename>\n", argv[0]);
340  exit(1);
341  }
342 
343  const char *input_DB_name = argv[1];
344  const char *final_candidates_filename = argv[2];
345 
346  //
347  // open probe-set-database
348  //
349  printf("Opening probe-set-database '%s'..\n", input_DB_name);
350  PS_Database *db = new PS_Database(input_DB_name, PS_Database::READONLY);
351  db->load();
352  SpeciesID max_id = db->getMaxID();
353  unsigned long bits_in_map = ((max_id+1)*max_id)/2 + max_id+1;
354  printf("..loaded database (enter to continue)\n"); fflush(stdout);
355 
356  //
357  // candidates
358  //
359  printf("opening candidates-file '%s'..\n", final_candidates_filename);
360  PS_Candidate *candidates_root = new PS_Candidate(0.0);
361  PS_FileBuffer *candidates_file = new PS_FileBuffer(final_candidates_filename, PS_FileBuffer::READONLY);
362  candidates_root->load(candidates_file, bits_in_map, db->getConstRootNode());
363  delete candidates_file;
364  printf("..loaded candidates file (enter to continue)\n"); fflush(stdout);
365 
366  //
367  // scan candidates-tree for leaf candidates
368  //
369  printf("\nsearching leaf candidates.. ");
370  PS_CandidateSet leaf_candidates;
371  PS_get_leaf_candidates(candidates_root, leaf_candidates);
372  printf("%zu\n", leaf_candidates.size());
373 
374  //
375  // scan leaf-candidates for non-permutated node-paths
376  //
377  printf("\ngetting node paths for leaf candidates..");
379  PS_get_node_paths(leaf_candidates, node_paths);
380  printf("%zu\n", node_paths.size());
381  for (PS_Candidate2NodeSetPairByLengthMap::const_iterator stored_path = node_paths.begin();
382  stored_path != node_paths.end();
383  ++stored_path) {
384  printf("length (%3lu) candidate [%p] nodes (%3zu) ",
385  stored_path->first,
386  stored_path->second.first,
387  stored_path->second.second.size());
388  unsigned long sum = 0;
389  for (PS_NodeSet::const_iterator node = stored_path->second.second.begin();
390  node != stored_path->second.second.end();
391  ++node) {
392  sum += (*node)->countProbes();
393  }
394  printf("probes (%6lu)\n", sum); fflush(stdout);
395  }
396 
397  //
398  // eval node paths
399  //
400  printf("\nevaluating node paths of leaf candidates..\n");
401  float min_sum_of_square_distances_to_average;
402  float average;
403  PS_eval_node_paths(node_paths, min_sum_of_square_distances_to_average, average);
404  printf(" best candidate : average (%f) sum of square distances to average (%f)\n ", average, min_sum_of_square_distances_to_average);
405  node_paths.begin()->second.first->print();
406 
407  //
408  // remove probes with unwanted distance or length
409  //
410  printf("\nremoving unwanted probes from best candidate..\n");
411  set<unsigned int> probe_lengths;
412  PS_remove_bad_probes(node_paths.begin()->second.second, average, probe_lengths);
413 
414  //
415  // write out paths to probes
416  //
417  char *final_candidates_paths_filename = (char *)malloc(strlen(final_candidates_filename)+1+6);
418  strcpy(final_candidates_paths_filename, final_candidates_filename);
419  strcat(final_candidates_paths_filename, ".paths");
420  printf("Writing final candidate's paths to '%s'..\n", final_candidates_paths_filename);
421  PS_FileBuffer *final_candidates_paths_file = new PS_FileBuffer(final_candidates_paths_filename, PS_FileBuffer::WRITEONLY);
422  PS_CandidatePtr c = node_paths.begin()->second.first;
423  unsigned long min_temp = PS_calc_temp(*c->node->getProbesBegin());
424  unsigned long max_temp = min_temp;
425  // write count of paths
426  final_candidates_paths_file->put_ulong(c->depth);
427  // write used probe lengths (informal)
428  final_candidates_paths_file->put_uint(probe_lengths.size());
429  printf(" probe lengths :");
430  for (set<unsigned int>::iterator length = probe_lengths.begin();
431  length != probe_lengths.end();
432  ++length) {
433  final_candidates_paths_file->put_uint(*length);
434  printf(" %u", *length);
435  }
436  printf("\n");
437  // write paths
438  while (c && !c->node.isNull()) {
440  unsigned long temp = PS_calc_temp(*probe);
441  if (temp < min_temp) min_temp = temp;
442  if (temp > max_temp) max_temp = temp;
443  printf("depth (%lu) candidate [%p] node [%p] probe (q_%+i__l_%2u__gc_%2u__temp_%3lu)\n",
444  c->depth,
445  c,
446  &(*(c->node)),
447  (*probe)->quality,
448  (*probe)->length,
449  (*probe)->GC_content,
450  temp);
451  // write probe length
452  final_candidates_paths_file->put_uint((*probe)->length);
453  // write probe GC-content
454  final_candidates_paths_file->put_uint((*probe)->GC_content);
455 
456  if ((*probe)->quality >= 0) {
457  // write path length
458  final_candidates_paths_file->put_uint(c->path.size());
459  // write path
460  for (IDSetCIter id = c->path.begin();
461  id != c->path.end();
462  ++id) {
463  final_candidates_paths_file->put_int(*id);
464  }
465  }
466  else {
467  // write inverse path length
468  final_candidates_paths_file->put_uint(db->getSpeciesCount() - c->path.size());
469  // write inverse path
470  IDSetCIter path_it = c->path.begin();
471  SpeciesID path_id = *path_it;
472  for (SpeciesID id = db->getMinID();
473  id <= db->getMaxID();
474  ++id) {
475  if (id == path_id) {
476  ++path_it;
477  path_id = (path_it == c->path.end()) ? -1 : *path_it;
478  continue;
479  }
480  final_candidates_paths_file->put_int(id);
481  }
482  }
483  // write dummy probe data
484  for (unsigned int i = 0; i < (*probe)->length; ++i) {
485  final_candidates_paths_file->put_char('\x00');
486  }
487  c = c->parent;
488  }
489  // write probe lengths again
490  // this set is used store remaining 'todo probe-lengths'
491  final_candidates_paths_file->put_uint(probe_lengths.size());
492  for (set<unsigned int>::iterator length = probe_lengths.begin();
493  length != probe_lengths.end();
494  ++length) {
495  final_candidates_paths_file->put_uint(*length);
496  }
497 
498  printf(" temperature range %lu..%lu\n", min_temp, max_temp);
499  free(final_candidates_paths_filename);
500  delete final_candidates_paths_file;
501 
502  //
503  // clean up
504  //
505  printf("cleaning up... candidates\n"); fflush(stdout);
506  delete candidates_root;
507  printf("cleaning up... database\n"); fflush(stdout);
508  delete db;
509 
510  return 0;
511 }
void put_ulong(unsigned long int _ul)
void PS_eval_node_paths(PS_Candidate2NodeSetPairByLengthMap &_paths, float &_min_sum_of_square_distances_to_average, float &_best_average)
static const bool READONLY
void put_char(unsigned char _c)
map< float, float > FFMap
const char * id
Definition: AliAdmin.cxx:17
set< unsigned long > ULSet
int main(int argc, char *argv[])
void removeProbe(PS_ProbeSetCIter it)
Definition: ps_node.hxx:185
void put_int(int _i)
PS_CandidateSet::const_iterator PS_CandidateSetCIter
void PS_get_leaf_candidates(PS_CandidatePtr _candidate_parent, PS_CandidateSet &_leaf_candidates)
PS_ProbeSet::const_iterator PS_ProbeSetCIter
Definition: ps_node.hxx:52
size_t countProbes() const
Definition: ps_node.hxx:158
multimap< unsigned long, PS_Candidate2NodeSetPair > PS_Candidate2NodeSetPairByLengthMap
set< float > FSet
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248
PS_ProbeSetCIter getProbesBegin() const
Definition: ps_node.hxx:176
bool PS_calc_min_sum_of_square_distances_to_average(PS_NodeSet &_nodepath, FFMap &_averages, float &_best_average, float &_min_sum)
void put_uint(unsigned int _ui)
PS_ProbeSetCIter getProbesEnd() const
Definition: ps_node.hxx:180
void PS_remove_bad_probes(PS_NodeSet &_nodes, float _average, set< unsigned int > &_probe_lengths)
fflush(stdout)
set< PS_CandidateSPtr, cmp_candidates > PS_CandidateSet
set< unsigned short > USSet
Generic smart pointer.
Definition: smartptr.h:149
PS_CandidateByGainMap children
pair< PS_CandidatePtr, PS_NodeSet > PS_Candidate2NodeSetPair
PS_CandidateByGainMap::reverse_iterator PS_CandidateByGainMapRIter
unsigned long depth
void PS_get_node_paths(PS_CandidateSet &_leaf_candidates, PS_Candidate2NodeSetPairByLengthMap &_paths)
set< PS_Node * > PS_NodeSet
PS_Candidate * parent
void PS_calc_sums_for_nodepath(PS_NodeSet &_nodepath, ULSet &_sums)
unsigned long PS_calc_temp(const PS_ProbePtr &_probe)
void load(PS_FileBuffer *_file, const unsigned long _bits_in_map, const PS_NodePtr &_root_node)
PS_NodePtr node
size_t length
static const bool WRITEONLY
IDSet::const_iterator IDSetCIter
Definition: ps_defs.hxx:41