ARB
SQ_functions.cxx
Go to the documentation of this file.
1 // ==================================================================== //
2 // //
3 // File : SQ_functions.cxx //
4 // Purpose : Implementation of SQ_functions.h //
5 // //
6 // //
7 // Coded by Juergen Huber in July 2003 - February 2004 //
8 // Coded by Kai Bader (baderk@in.tum.de) in 2007 - 2008 //
9 // Copyright Department of Microbiology (Technical University Munich) //
10 // //
11 // Visit our web site at: http://www.arb-home.de/ //
12 // //
13 // ==================================================================== //
14 
15 #include "SQ_ambiguities.h"
16 #include "SQ_helix.h"
17 #include "SQ_physical_layout.h"
18 #include "SQ_functions.h"
19 
20 #include <aw_preset.hxx>
21 #include <arb_progress.h>
22 #include <TreeNode.h>
23 
24 using namespace std;
25 
26 typedef GBDATA *(*species_iterator)(GBDATA *);
27 
29 
30 enum {
32 };
33 
36  swap(tmp, group_dict);
37 }
38 
39 static GB_ERROR no_data_error(GBDATA *gb_species, const char *ali_name) {
40  GBDATA *gb_name = GB_entry(gb_species, "name");
41  const char *name = gb_name ? GB_read_char_pntr(gb_name) : "<unknown>";
42  return GBS_global_string("Species '%s' has no data in alignment '%s'", name, ali_name);
43 }
44 
45 static int sq_round(double value) {
46  int x;
47 
48  value += 0.5;
49  x = (int) floor(value);
50  return x;
51 }
52 
54  GB_transaction ta(gb_main);
56 
57  for (GBDATA *gb_species = GBT_first_species(gb_main);
58  gb_species && !error;
59  gb_species = GBT_next_species(gb_species))
60  {
61  GBDATA *gb_quality = GB_search(gb_species, "quality", GB_FIND);
62  if (gb_quality) {
63  error = GB_delete(gb_quality);
64  }
65  }
66  return ta.close(error);
67 }
68 
70  struct {
71  const char *key;
72  GB_TYPES type;
73  }
74  keys2add[] = {
75  // fields written by SQ_calc_physical_layout:
76  { "number_of_bases", GB_INT },
77  { "percent_of_bases", GB_INT },
78  { "GC_proportion", GB_FLOAT },
79 
80  // fields written by SQ_calc_helix_layout:
81  { "number_of_no_helix", GB_INT },
82  { "number_of_weak_helix", GB_INT },
83  { "number_of_strong_helix", GB_INT },
84 
85  // fields written by SQ_pass2 + SQ_pass2_no_tree:
86  { "percent_base_deviation", GB_INT },
87  { "percent_GC_difference", GB_INT },
88 
89  { "consensus_conformity/name", GB_STRING },
90  { "consensus_conformity/value", GB_FLOAT },
91  { "consensus_conformity/num_species", GB_INT },
92 
93  { "consensus_deviation/name", GB_STRING },
94  { "consensus_deviation/value", GB_FLOAT },
95  { "consensus_deviation/num_species", GB_INT },
96 
97  { "consensus_evaluated", GB_INT },
98 
99  // fields written by SQ_evaluate
100  { "evaluation", GB_INT },
101 
102  { NULp, GB_NONE }
103  };
104 
105  GB_ERROR error = NULp;
106  for (int i = 0; keys2add[i].key && !error; ++i) {
107  const char *fullkey = GBS_global_string("quality/%s/%s", alignment_name, keys2add[i].key);
108  error = GBT_add_new_species_changekey(gb_main, fullkey, keys2add[i].type);
109  }
110 
111  return error;
112 }
113 
114 
115 GB_ERROR SQ_evaluate(GBDATA *gb_main, const SQ_weights& weights, bool marked_only) {
116  char *alignment_name = GBT_get_default_alignment(gb_main); seq_assert(alignment_name);
117 
120 
121  GB_ERROR error = NULp;
122  for (GBDATA *gb_species = getFirst(gb_main); gb_species && !error; gb_species = getNext(gb_species)) {
123  GBDATA *gb_name = GB_entry(gb_species, "name");
124  if (!gb_name) {
125  error = GB_get_error();
126  }
127  else {
128  GBDATA *gb_quality = GB_entry(gb_species, "quality");
129  if (gb_quality) {
130  GBDATA *gb_quality_ali = GB_entry(gb_quality, alignment_name);
131 
132  if (!gb_quality_ali) {
133  error = GBS_global_string("No alignment entry '%s' in quality data", alignment_name);
134  }
135  else {
136 
137  // evaluate the percentage of bases the actual sequence consists of
138  GBDATA *gb_result1 = GB_search(gb_quality_ali, "percent_of_bases", GB_INT);
139  int bases = GB_read_int(gb_result1);
140 
141  double result = bases<4 ? 0 : (bases<6 ? 1 : 2);
142  if (result != 0) result = (result * weights.bases) / 2;
143 
144  double value = 0;
145  value += result;
146 
147  // evaluate the difference in number of bases from sequence to group
148  GBDATA *gb_result2 = GB_search(gb_quality_ali, "percent_base_deviation", GB_INT);
149  int dfa = abs(GB_read_int(gb_result2));
150 
151  if (dfa < 2) result = 5;
152  else if (dfa < 4) result = 4;
153  else if (dfa < 6) result = 3;
154  else if (dfa < 8) result = 2;
155  else if (dfa < 10) result = 1;
156  else result = 0;
157 
158  if (result != 0) result = (result * weights.diff_from_average) / 5;
159  value += result;
160 
161  // evaluate the number of positions where no helix can be built
162  GBDATA *gb_result3 = GB_search(gb_quality_ali, "number_of_no_helix", GB_INT);
163  int noh = GB_read_int(gb_result3);
164 
165  if (noh < 20) result = 5;
166  else if (noh < 50) result = 4;
167  else if (noh < 125) result = 3;
168  else if (noh < 250) result = 2;
169  else if (noh < 500) result = 1;
170  else result = 0;
171 
172  if (result != 0) result = (result * weights.helix) / 5;
173  value += result;
174 
175  // evaluate the consensus
176  GBDATA *gb_result4 = GB_search(gb_quality_ali, "consensus_evaluated", GB_INT);
177  int cos = GB_read_int(gb_result4);
178 
179  result = cos;
180  if (result != 0) result = (result * weights.consensus) / 12;
181  value += result;
182 
183  // evaluate the number of iupacs in a sequence
184  GBDATA *gb_result5 = GB_search(gb_quality_ali, "iupac_value", GB_INT);
185  int iupv = GB_read_int(gb_result5);
186 
187  if (iupv < 1) result = 3;
188  else if (iupv < 5) result = 2;
189  else if (iupv < 10) result = 1;
190  else result = 0;
191 
192  if (result != 0) result = (result * weights.iupac) / 3;
193  value += result;
194 
195  // evaluate the difference in the GC proportion from sequence to group
196  GBDATA *gb_result6 = GB_search(gb_quality_ali, "percent_GC_difference", GB_INT);
197  int gcprop = abs(GB_read_int(gb_result6));
198 
199  if (gcprop < 1) result = 5;
200  else if (gcprop < 2) result = 4;
201  else if (gcprop < 4) result = 3;
202  else if (gcprop < 8) result = 2;
203  else if (gcprop < 16) result = 1;
204  else result = 0;
205 
206  if (result != 0) result = (result * weights.gc) / 5;
207  value += result;
208 
209  // write the final value of the evaluation
210  int value2 = sq_round(value);
211  GBDATA *gb_result7 = GB_search(gb_quality_ali, "evaluation", GB_INT);
212  seq_assert(gb_result7);
213  GB_write_int(gb_result7, value2);
214  }
215  }
216  }
217  }
218 
219  if (!error) error = SQ_add_changekeys(gb_main, alignment_name);
220 
221  free(alignment_name);
222  return error;
223 }
224 
225 static char *SQ_fetch_filtered_sequence(GBDATA *read_sequence, AP_filter *filter) {
226  char *filteredSequence = NULp;
227  if (read_sequence) {
228  const char *rawSequence = GB_read_char_pntr(read_sequence);
229 
230  UNCOVERED(); // @@@ use AP_filter::filter_string here! (need tests first)
231 
232  int filteredLength = filter->get_filtered_length();
233  const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos();
234 
235  ARB_alloc(filteredSequence, filteredLength);
236  if (filteredSequence) {
237  for (int i = 0; i < filteredLength; ++i) {
238  filteredSequence[i] = rawSequence[filterpos_2_seqpos[i]];
239  }
240  }
241  }
242  return filteredSequence;
243 }
244 
245 static GB_ERROR SQ_pass1(SQ_GroupData *globalData, GBDATA *gb_main, TreeNode *node, AP_filter *filter) {
246  GB_ERROR error = NULp;
247  char *alignment_name = GBT_get_default_alignment(gb_main); seq_assert(alignment_name);
248  GBDATA *gb_species = node->gb_node;
249  GBDATA *gb_name = GB_entry(gb_species, "name");
250 
251  if (!gb_name) error = GB_get_error();
252  else {
253  GBDATA *gb_ali = GB_entry(gb_species, alignment_name);
254 
255  if (!gb_ali) {
256  error = no_data_error(gb_species, alignment_name);
257  }
258  else {
259  GBDATA *gb_quality = GB_search(gb_species, "quality", GB_CREATE_CONTAINER);
260 
261  if (!gb_quality) {
262  error = GB_get_error();
263  }
264 
265  GBDATA *read_sequence = GB_entry(gb_ali, "data");
266  GBDATA *gb_quality_ali = GB_search(gb_quality, alignment_name, GB_CREATE_CONTAINER);
267 
268  if (!gb_quality_ali) error = GB_get_error();
269 
270  // real calculations start here
271  if (read_sequence) {
272  char *rawSequence = SQ_fetch_filtered_sequence(read_sequence, filter);
273  int sequenceLength = filter->get_filtered_length();
274 
275  // calculate physical layout of sequence
276  {
277  SQ_physical_layout ps_chan;
278  ps_chan.SQ_calc_physical_layout(rawSequence, sequenceLength, gb_quality_ali);
279 
280  // calculate the average number of bases in group
281  globalData->SQ_count_sequences();
282  globalData->SQ_set_avg_bases(ps_chan.SQ_get_number_of_bases());
283  globalData->SQ_set_avg_gc(ps_chan.SQ_get_gc_proportion());
284  }
285 
286  // get values for ambiguities
287  {
288  SQ_ambiguities ambi_chan;
289  ambi_chan.SQ_count_ambiguities(rawSequence, sequenceLength, gb_quality_ali);
290  }
291 
292  // calculate the number of strong, weak and no helixes
293  {
294  SQ_helix heli_chan(sequenceLength);
295  heli_chan.SQ_calc_helix_layout(rawSequence, gb_main, alignment_name, gb_quality_ali, filter);
296  }
297 
298  // calculate consensus sequence
299  {
300  if (!globalData->SQ_is_initialized()) {
301  globalData->SQ_init_consensus(sequenceLength);
302  }
303  globalData->SQ_add_sequence(rawSequence);
304  }
305 
306  free(rawSequence);
307  }
308  }
309  }
310 
311  free(alignment_name);
312 
313  seq_assert(error || globalData->getSize()>0);
314 
315  return error;
316 }
317 
319  GBDATA *read_sequence = NULp;
320  char *alignment_name = GBT_get_default_alignment(gb_main); seq_assert(alignment_name);
321  GB_ERROR error = NULp;
322 
323  // first pass operations
324  for (GBDATA *gb_species = GBT_first_species(gb_main); gb_species && !error; gb_species = GBT_next_species(gb_species)) {
325  GBDATA *gb_name = GB_entry(gb_species, "name");
326 
327  if (!gb_name) {
328  error = GB_get_error();
329  }
330  else {
331  GBDATA *gb_ali = GB_entry(gb_species, alignment_name);
332 
333  if (!gb_ali) {
334  error = no_data_error(gb_species, alignment_name);
335  }
336  else {
337  GBDATA *gb_quality = GB_search(gb_species, "quality", GB_CREATE_CONTAINER);
338  if (!gb_quality) {
339  error = GB_get_error();
340  }
341 
342  read_sequence = GB_entry(gb_ali, "data");
343 
344  GBDATA *gb_quality_ali = GB_search(gb_quality, alignment_name, GB_CREATE_CONTAINER);
345  if (!gb_quality_ali)
346  error = GB_get_error();
347 
348  // real calculations start here
349  if (read_sequence) {
350  char *rawSequence = SQ_fetch_filtered_sequence(read_sequence, filter);
351  int sequenceLength = filter->get_filtered_length();
352 
353  // calculate physical layout of sequence
354  SQ_physical_layout *ps_chan = new SQ_physical_layout;
355  ps_chan->SQ_calc_physical_layout(rawSequence, sequenceLength, gb_quality_ali);
356 
357  // calculate the average number of bases in group
358  globalData->SQ_count_sequences();
359  globalData->SQ_set_avg_bases(ps_chan->SQ_get_number_of_bases());
360  globalData->SQ_set_avg_gc(ps_chan->SQ_get_gc_proportion());
361  delete ps_chan;
362 
363  // get values for ambiguities
364  SQ_ambiguities *ambi_chan = new SQ_ambiguities;
365  ambi_chan->SQ_count_ambiguities(rawSequence, sequenceLength, gb_quality_ali);
366  delete ambi_chan;
367 
368  // calculate the number of strong, weak and no helixes
369  SQ_helix *heli_chan = new SQ_helix(sequenceLength);
370  heli_chan->SQ_calc_helix_layout(rawSequence, gb_main, alignment_name, gb_quality_ali, filter);
371  delete heli_chan;
372 
373  // calculate consensus sequence
374  {
375  if (!globalData->SQ_is_initialized()) {
376  globalData->SQ_init_consensus(sequenceLength);
377  }
378  globalData->SQ_add_sequence(rawSequence);
379  }
380  delete(rawSequence);
381  }
382  }
383  }
384  progress.inc_and_check_user_abort(error);
385  }
386 
387  free(alignment_name);
388  return error;
389 }
390 
391 static GB_ERROR SQ_pass2(const SQ_GroupData *globalData, GBDATA *gb_main, TreeNode *node, AP_filter *filter) {
392  char *alignment_name = GBT_get_default_alignment(gb_main); seq_assert(alignment_name);
393  GBDATA *gb_species = node->gb_node;
394  GBDATA *gb_name = GB_entry(gb_species, "name");
395  GB_ERROR error = NULp;
396 
397  if (!gb_name) error = GB_get_error();
398  else {
399  GBDATA *gb_ali = GB_entry(gb_species, alignment_name);
400 
401  if (!gb_ali) {
402  error = no_data_error(gb_species, alignment_name);
403  }
404  else {
405  GBDATA *gb_quality = GB_search(gb_species, "quality", GB_CREATE_CONTAINER);
406  if (!gb_quality) error = GB_get_error();
407 
408  GBDATA *gb_quality_ali = GB_search(gb_quality, alignment_name, GB_CREATE_CONTAINER);
409  if (!gb_quality_ali) error = GB_get_error();
410 
411  GBDATA *read_sequence = GB_entry(gb_ali, "data");
412 
413  // real calculations start here
414  if (read_sequence) {
415  char *rawSequence = SQ_fetch_filtered_sequence(read_sequence, filter);
416 
417  /*
418  calculate the average number of bases in group, and the difference of
419  a single sequence in group from it
420  */
421  {
422  GBDATA *gb_result1 = GB_search(gb_quality_ali, "number_of_bases", GB_INT);
423  int bases = GB_read_int(gb_result1);
424  int avg_bases = globalData->SQ_get_avg_bases();
425  int diff_percent = 0;
426 
427  if (avg_bases != 0) {
428  double diff = bases - avg_bases;
429  diff = (100 * diff) / avg_bases;
430  diff_percent = sq_round(diff);
431  }
432 
433  GBDATA *gb_result2 = GB_search(gb_quality_ali, "percent_base_deviation", GB_INT);
434  seq_assert(gb_result2);
435  GB_write_int(gb_result2, diff_percent);
436  }
437 
438  /*
439  calculate the average gc proportion in group, and the difference of
440  a single sequence in group from it
441  */
442  {
443  GBDATA *gb_result6 = GB_search(gb_quality_ali, "GC_proportion", GB_FLOAT);
444  double gcp = GB_read_float(gb_result6);
445  double avg_gc = globalData->SQ_get_avg_gc();
446  int diff_percent = 0;
447 
448  if (avg_gc != 0) {
449  double diff = gcp - avg_gc;
450  diff = (100 * diff) / avg_gc;
451  diff_percent = sq_round(diff);
452  }
453 
454  GBDATA *gb_result7 = GB_search(gb_quality_ali, "percent_GC_difference", GB_INT);
455  seq_assert(gb_result7);
456  GB_write_int(gb_result7, diff_percent);
457  }
458 
459  /*
460  get groupnames of visited groups
461  search for name in group dictionary
462  evaluate sequence with group consensus
463  */
464  GBDATA *gb_con = GB_search(gb_quality_ali, "consensus_conformity", GB_CREATE_CONTAINER);
465  if (!gb_con) error = GB_get_error();
466 
467  GBDATA *gb_dev = GB_search(gb_quality_ali, "consensus_deviation", GB_CREATE_CONTAINER);
468  if (!gb_dev) error = GB_get_error();
469 
470  TreeNode *backup = node; // needed?
471  int whilecounter = 0;
472  double eval = 0;
473 
474  while (backup->father) {
475  if (backup->name) {
476  SQ_GroupDataDictionary::iterator GDI = group_dict.find(backup->name);
477  if (GDI != group_dict.end()) {
478  SQ_GroupDataPtr GD_ptr = GDI->second;
479 
480  consensus_result cr = GD_ptr->SQ_calc_consensus(rawSequence);
481 
482  double value1 = cr.conformity;
483  double value2 = cr.deviation;
484  int value3 = GD_ptr->SQ_get_nr_sequences();
485 
486  GBDATA *gb_node_entry = GB_search(gb_con, "name", GB_STRING);
487  seq_assert(gb_node_entry);
488  GB_write_string(gb_node_entry, backup->name);
489 
490  gb_node_entry = GB_search(gb_con, "value", GB_FLOAT); seq_assert(gb_node_entry);
491  GB_write_float(gb_node_entry, value1);
492 
493  gb_node_entry = GB_search(gb_con, "num_species", GB_INT); seq_assert(gb_node_entry);
494  GB_write_int(gb_node_entry, value3);
495 
496  gb_node_entry = GB_search(gb_dev, "name", GB_STRING); seq_assert(gb_node_entry);
497  GB_write_string(gb_node_entry, backup->name);
498 
499  gb_node_entry = GB_search(gb_dev, "value", GB_FLOAT); seq_assert(gb_node_entry);
500  GB_write_float(gb_node_entry, value2);
501 
502  gb_node_entry = GB_search(gb_dev, "num_species", GB_INT); seq_assert(gb_node_entry);
503  GB_write_int(gb_node_entry, value3);
504 
505  // if you parse the upper two values in the evaluate() function cut the following out
506  // for time reasons i do the evaluation here, as i still have the upper two values
507  // -------------cut this-----------------
508 
509  if (value1 > 0.95) eval += 5;
510  else if (value1 > 0.8) eval += 4;
511  else if (value1 > 0.6) eval += 3;
512  else if (value1 > 0.4) eval += 2;
513  else if (value1 > 0.25) eval += 1;
514  else eval += 0;
515 
516  if (value2 > 0.6) eval += 0;
517  else if (value2 > 0.4) eval += 1;
518  else if (value2 > 0.2) eval += 2;
519  else if (value2 > 0.1) eval += 3;
520  else if (value2 > 0.05) eval += 4;
521  else if (value2 > 0.025) eval += 5;
522  else if (value2 > 0.01) eval += 6;
523  else eval += 7;
524 
525  whilecounter++;
526  // ---------to this and scroll down--------
527  }
528  }
529  backup = backup->get_father();
530  }
531 
532  // --------also cut this------
533  int evaluation = 0;
534  if (eval != 0) {
535  eval = eval / whilecounter;
536  evaluation = sq_round(eval);
537  }
538  GBDATA *gb_result5 = GB_search(gb_quality_ali, "consensus_evaluated", GB_INT);
539  seq_assert(gb_result5);
540  GB_write_int(gb_result5, evaluation);
541  // --------end cut this-------
542 
543  free(rawSequence);
544  }
545  }
546  }
547 
548  free(alignment_name);
549  return error;
550 }
551 
552 GB_ERROR SQ_pass2_no_tree(const SQ_GroupData *globalData, GBDATA *gb_main, AP_filter *filter, arb_progress& progress) {
553  GBDATA *read_sequence = NULp;
554  char *alignment_name = GBT_get_default_alignment(gb_main); seq_assert(alignment_name);
555 
556  // second pass operations
557  GB_ERROR error = NULp;
558  for (GBDATA *gb_species = GBT_first_species(gb_main); gb_species && !error; gb_species = GBT_next_species(gb_species)) {
559  GBDATA *gb_name = GB_entry(gb_species, "name");
560 
561  if (!gb_name) {
562  error = GB_get_error();
563  }
564  else {
565  GBDATA *gb_ali = GB_entry(gb_species, alignment_name);
566  if (!gb_ali) {
567  error = no_data_error(gb_species, alignment_name);
568  }
569  else {
570  GBDATA *gb_quality = GB_search(gb_species, "quality", GB_CREATE_CONTAINER);
571  if (!gb_quality) error = GB_get_error();
572 
573  GBDATA *gb_quality_ali = GB_search(gb_quality, alignment_name, GB_CREATE_CONTAINER);
574  if (!gb_quality_ali) error = GB_get_error();
575 
576  read_sequence = GB_entry(gb_ali, "data");
577 
578  // real calculations start here
579  if (read_sequence) {
580  const char *rawSequence = SQ_fetch_filtered_sequence(read_sequence, filter);
581 
582  /*
583  calculate the average number of bases in group, and the difference of
584  a single sequence in group from it
585  */
586  {
587  GBDATA *gb_result1 = GB_search(gb_quality_ali, "number_of_bases", GB_INT);
588  int bases = GB_read_int(gb_result1);
589  int avg_bases = globalData->SQ_get_avg_bases();
590  int diff_percent = 0;
591 
592  if (avg_bases != 0) {
593  double diff = bases - avg_bases;
594  diff = (100 * diff) / avg_bases;
595  diff_percent = sq_round(diff);
596  }
597 
598  GBDATA *gb_result2 = GB_search(gb_quality_ali, "percent_base_deviation", GB_INT);
599  seq_assert(gb_result2);
600  GB_write_int(gb_result2, diff_percent);
601  }
602 
603  /*
604  calculate the average gc proportion in group, and the difference of
605  a single sequence in group from it
606  */
607  {
608  GBDATA *gb_result6 = GB_search(gb_quality_ali, "GC_proportion", GB_FLOAT);
609  double gcp = GB_read_float(gb_result6);
610  double avg_gc = globalData->SQ_get_avg_gc();
611  int diff_percent = 0;
612 
613  if (avg_gc != 0) {
614  double diff = gcp - avg_gc;
615  diff = (100 * diff) / avg_gc;
616  diff_percent = sq_round(diff);
617  }
618 
619  GBDATA *gb_result7 = GB_search(gb_quality_ali, "percent_GC_difference", GB_INT);
620  seq_assert(gb_result7);
621  GB_write_int(gb_result7, diff_percent);
622  }
623  /*
624  get groupnames of visited groups
625  search for name in group dictionary
626  evaluate sequence with group consensus
627  */
628  GBDATA *gb_con = GB_search(gb_quality_ali, "consensus_conformity", GB_CREATE_CONTAINER);
629  if (!gb_con) error = GB_get_error();
630 
631  GBDATA *gb_dev = GB_search(gb_quality_ali, "consensus_deviation", GB_CREATE_CONTAINER);
632  if (!gb_dev) error = GB_get_error();
633 
634  consensus_result cr = globalData->SQ_calc_consensus(rawSequence);
635 
636  double value1 = cr.conformity;
637  double value2 = cr.deviation;
638  int value3 = globalData->SQ_get_nr_sequences();
639 
640  GBDATA *gb_node_entry = GB_search(gb_con, "name", GB_STRING);
641  seq_assert(gb_node_entry);
642  GB_write_string(gb_node_entry, "one global consensus");
643 
644  gb_node_entry = GB_search(gb_con, "value", GB_FLOAT); seq_assert(gb_node_entry);
645  GB_write_float(gb_node_entry, value1);
646 
647  gb_node_entry = GB_search(gb_con, "num_species", GB_INT); seq_assert(gb_node_entry);
648  GB_write_int(gb_node_entry, value3);
649 
650  gb_node_entry = GB_search(gb_dev, "name", GB_STRING); seq_assert(gb_node_entry);
651  GB_write_string(gb_node_entry, "one global consensus");
652 
653  gb_node_entry = GB_search(gb_dev, "value", GB_FLOAT); seq_assert(gb_node_entry);
654  GB_write_float(gb_node_entry, value2);
655 
656  gb_node_entry = GB_search(gb_dev, "num_species", GB_INT); seq_assert(gb_node_entry);
657  GB_write_int(gb_node_entry, value3);
658 
659  double eval = 0;
660 
661  // if you parse the upper two values in the evaluate() function cut the following out
662  // for time reasons i do the evaluation here, as i still have the upper two values
663  // -------------cut this-----------------
664 
665  if (value1 > 0.95) eval += 5;
666  else if (value1 > 0.8) eval += 4;
667  else if (value1 > 0.6) eval += 3;
668  else if (value1 > 0.4) eval += 2;
669  else if (value1 > 0.25) eval += 1;
670  else eval += 0;
671 
672  if (value2 > 0.6) eval += 0;
673  else if (value2 > 0.4) eval += 1;
674  else if (value2 > 0.2) eval += 2;
675  else if (value2 > 0.1) eval += 3;
676  else if (value2 > 0.05) eval += 4;
677  else if (value2 > 0.025) eval += 5;
678  else if (value2 > 0.01) eval += 6;
679  else eval += 7;
680 
681  {
682  int evaluation = 0;
683  if (eval != 0) evaluation = sq_round(eval);
684 
685  GBDATA *gb_result5 = GB_search(gb_quality_ali, "consensus_evaluated", GB_INT);
686  seq_assert(gb_result5);
687  GB_write_int(gb_result5, evaluation);
688  }
689  // --------end cut this-------
690  delete(rawSequence);
691  }
692  }
693  }
694  progress.inc_and_check_user_abort(error);
695  }
696 
697  free(alignment_name);
698  return error;
699 }
700 
702  SQ_GroupData *newData = data->clone(); // save actual consensus
703  *newData = *data;
704  group_dict[node->name] = newData; // and link it with an name
705 }
706 
708  GB_ERROR error = NULp;
709 
710  if (node->is_leaf()) {
711  if (node->gb_node) {
712  error = SQ_pass1(data, gb_main, node, filter);
713  }
714  }
715  else {
716  TreeNode *node1 = node->get_leftson();
717  TreeNode *node2 = node->get_rightson();
718 
719  if (node->name) {
720  SQ_GroupData *leftData = NULp;
721  bool parentIsEmpty = false;
722 
723  if (data->getSize() == 0) {
724  parentIsEmpty = true;
725  error = SQ_pass1_on_tree(node1, gb_main, data, filter, progress); // process left branch with empty data
726  }
727  else {
728  leftData = data->clone(); // create new empty SQ_GroupData
729  error = SQ_pass1_on_tree(node1, gb_main, leftData, filter, progress); // process left branch
730  }
731 
732  if (!error) {
733  SQ_GroupData *rightData = data->clone(); // create new empty SQ_GroupData
734  error = SQ_pass1_on_tree(node2, gb_main, rightData, filter, progress); // process right branch
735 
736  if (!error) {
737  if (!parentIsEmpty) data->SQ_add(*leftData); // otherwise already collected into 'data'
738  data->SQ_add(*rightData);
739 
740  create_multi_level_consensus(node, data);
741  }
742  delete rightData;
743  }
744  delete leftData;
745  }
746  else {
747  error = SQ_pass1_on_tree(node1, gb_main, data, filter, progress); // enter left branch
748  if (!error) error = SQ_pass1_on_tree(node2, gb_main, data, filter, progress); // enter right branch
749  }
750  }
751 
752  progress.inc();
753 
754  seq_assert(error || data->getSize()>0);
755 
756  return error;
757 }
758 
760  GB_ERROR error = NULp;
761 
762  if (node->is_leaf()) {
763  if (node->gb_node) {
764  error = SQ_pass2(data, gb_main, node, filter);
765  }
766  }
767  else {
768  error = SQ_pass2_on_tree(node->get_leftson(), gb_main, data, filter, progress);
769  if (!error) error = SQ_pass2_on_tree(node->get_rightson(), gb_main, data, filter, progress);
770  }
771 
772  progress.inc(); // [Note: increment for else-branch not optimal - but we keep it in-sync with SQ_pass1_on_tree]
773 
774  return error;
775 }
776 
777 // marks species that are below threshold "evaluation"
778 GB_ERROR SQ_mark_species(GBDATA *gb_main, int condition, bool marked_only) {
779  int result = 0;
780  GBDATA *read_sequence = NULp;
781  GB_ERROR error = NULp;
782  char *alignment_name = GBT_get_default_alignment(gb_main); seq_assert(alignment_name);
783 
784  species_iterator getFirst = NULp;
785  species_iterator getNext = NULp;
786 
787  if (marked_only) {
788  getFirst = GBT_first_marked_species;
789  getNext = GBT_next_marked_species;
790  }
791  else {
792  getFirst = GBT_first_species;
793  getNext = GBT_next_species;
794  }
795 
796  for (GBDATA *gb_species = getFirst(gb_main); gb_species; gb_species = getNext(gb_species)) {
797  GBDATA *gb_ali = GB_entry(gb_species, alignment_name);
798  bool marked = false;
799 
800  if (gb_ali) {
801  GBDATA *gb_quality = GB_search(gb_species, "quality", GB_CREATE_CONTAINER);
802  if (gb_quality) {
803  read_sequence = GB_entry(gb_ali, "data");
804  if (read_sequence) {
805  GBDATA *gb_quality_ali = GB_search(gb_quality, alignment_name, GB_CREATE_CONTAINER);
806  if (gb_quality_ali) {
807  GBDATA *gb_result1 = GB_search(gb_quality_ali, "evaluation", GB_INT);
808  result = GB_read_int(gb_result1);
809 
810  if (result < condition) marked = true;
811  }
812  }
813  }
814  }
815 
816  if (GB_read_flag(gb_species) != marked) {
817  GB_write_flag(gb_species, marked);
818  }
819  }
820 
821  free(alignment_name);
822  return error;
823 }
824 
826  SQ_TREE_ERROR retval = NONE;
827 
828  if (!node)
829  return MISSING_NODE;
830 
831  if (node->is_leaf()) {
832  if (!node->gb_node)
833  retval = ZOMBIE;
834  }
835  else {
836  retval = SQ_check_tree_structure(node->get_leftson());
837  if (retval == NONE) retval = SQ_check_tree_structure(node->get_rightson());
838  }
839 
840  return retval;
841 }
842 
GB_ERROR GB_get_error()
Definition: arb_msg.cxx:333
static GB_ERROR no_data_error(GBDATA *gb_species, const char *ali_name)
const char * GB_ERROR
Definition: arb_core.h:25
static char * SQ_fetch_filtered_sequence(GBDATA *read_sequence, AP_filter *filter)
#define NONE
Definition: GDE_def.h:34
static GB_ERROR SQ_pass2(const SQ_GroupData *globalData, GBDATA *gb_main, TreeNode *node, AP_filter *filter)
string result
const size_t * get_filterpos_2_seqpos() const
Definition: AP_filter.hxx:91
GB_TYPES type
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:729
int getSize() const
Definition: SQ_GroupData.h:81
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1387
bool SQ_is_initialized() const
Definition: SQ_GroupData.h:71
GB_ERROR SQ_add_changekeys(GBDATA *gb_main, const char *alignment_name)
GB_ERROR SQ_pass2_on_tree(TreeNode *node, GBDATA *gb_main, const SQ_GroupData *data, AP_filter *filter, arb_progress &progress)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
virtual void SQ_add_sequence(const char *sequence)=0
static char * alignment_name
STL namespace.
void SQ_calc_physical_layout(const char *sequence, int size, GBDATA *gb_quality_ali)
void SQ_count_ambiguities(const char *iupac, int length, GBDATA *gb_quality)
static int sq_round(double value)
void SQ_count_sequences()
Definition: SQ_GroupData.h:65
std::map< std::string, SQ_GroupDataPtr > SQ_GroupDataDictionary
Definition: SQ_functions.h:42
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
virtual consensus_result SQ_calc_consensus(const char *sequence) const =0
GB_ERROR GBT_add_new_species_changekey(GBDATA *gb_main, const char *name, GB_TYPES type)
static int diff(int v1, int v2, int v3, int v4, int st, int en)
Definition: ClustalV.cxx:534
Definition: arbdb.h:67
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
void SQ_calc_helix_layout(const char *sequence, GBDATA *gb_main, char *alignment_name, GBDATA *gb_quality, AP_filter *filter)
Definition: SQ_helix.h:97
virtual SQ_GroupData * clone() const =0
int SQ_get_avg_bases() const
Definition: SQ_GroupData.h:56
static void create_multi_level_consensus(TreeNode *node, SQ_GroupData *data)
GB_ERROR SQ_mark_species(GBDATA *gb_main, int condition, bool marked_only)
Generic smart pointer.
Definition: smartptr.h:149
GB_ERROR SQ_pass2_no_tree(const SQ_GroupData *globalData, GBDATA *gb_main, AP_filter *filter, arb_progress &progress)
void SQ_clear_group_dictionary()
static int weights[MAX_BASETYPES][MAX_BASETYPES]
Definition: ClustalV.cxx:71
void SQ_set_avg_bases(int bases)
Definition: SQ_GroupData.h:53
TreeNode * father
Definition: TreeNode.h:171
static void error(const char *msg)
Definition: mkptypes.cxx:96
GB_ERROR SQ_pass1_on_tree(TreeNode *node, GBDATA *gb_main, SQ_GroupData *data, AP_filter *filter, arb_progress &progress)
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
float GB_read_float(GBDATA *gbd)
Definition: arbdb.cxx:744
SQ_TREE_ERROR
Definition: SQ_functions.h:80
int GB_read_flag(GBDATA *gbd)
Definition: arbdb.cxx:2796
virtual void SQ_add(const SQ_GroupData &other)=0
#define seq_assert(bed)
Definition: arbdb.h:86
GB_ERROR GB_write_float(GBDATA *gbd, float f)
Definition: arbdb.cxx:1281
GB_ERROR GB_write_int(GBDATA *gbd, long i)
Definition: arbdb.cxx:1250
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
GB_ERROR SQ_pass1_no_tree(SQ_GroupData *globalData, GBDATA *gb_main, AP_filter *filter, arb_progress &progress)
double SQ_get_gc_proportion() const
int SQ_get_number_of_bases() const
static GB_ERROR SQ_pass1(SQ_GroupData *globalData, GBDATA *gb_main, TreeNode *node, AP_filter *filter)
Definition: arbdb.h:63
bool is_leaf() const
Definition: TreeNode.h:211
int SQ_get_nr_sequences() const
Definition: SQ_GroupData.h:68
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:35
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2773
GB_ERROR SQ_evaluate(GBDATA *gb_main, const SQ_weights &weights, bool marked_only)
char * name
Definition: TreeNode.h:174
virtual void SQ_init_consensus(int size)=0
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
#define abs(x)
Definition: f2c.h:151
void SQ_set_avg_gc(double gc)
Definition: SQ_GroupData.h:59
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:116
double SQ_get_avg_gc() const
Definition: SQ_GroupData.h:62
static SQ_GroupDataDictionary group_dict
SQ_TREE_ERROR SQ_check_tree_structure(TreeNode *node)
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:747
GB_TYPES
Definition: arbdb.h:62
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_node
Definition: TreeNode.h:173
GBDATA * gb_main
Definition: adname.cxx:32
GBDATA *(* species_iterator)(GBDATA *)
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
int diff_from_average
Definition: SQ_functions.h:59
GB_ERROR SQ_remove_quality_entries(GBDATA *gb_main)
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:332
#define UNCOVERED()
Definition: arb_assert.h:380
Definition: arbdb.h:66