ARB
ST_quality.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ST_quality.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "st_quality.hxx"
12 #include "st_ml.hxx"
13 #include "MostLikelySeq.hxx"
14 
15 #include <ColumnStat.hxx>
16 #include <BI_helix.hxx>
17 #include <AP_filter.hxx>
18 #include <arb_progress.h>
19 #include <arbdbt.h>
20 #include <arb_strbuf.h>
21 
22 #include <cctype>
23 #include <cmath>
24 #include <items.h>
25 
26 // --------------------------
27 // LikelihoodRanges
28 
29 LikelihoodRanges::LikelihoodRanges(size_t no_of_ranges) {
30  ranges = no_of_ranges;
31  likelihood = new VarianceSampler[ranges];
32 }
33 
35  VarianceSampler allRange;
36  for (size_t range = 0; range<ranges; ++range) {
37  allRange.add(likelihood[range]);
38  }
39  return allRange;
40 }
41 
44  char *report = new char[ranges+1];
45 
46  if (allRange.get_count()>1) {
47  double all_median = allRange.get_median();
48  double all_variance = allRange.get_variance(all_median);
49  double all_std_deviation = sqrt(all_variance);
50 
51  for (size_t range = 0; range<ranges; ++range) {
52  size_t count = likelihood[range].get_count();
53  if (count>1) {
54  // perform students-t-test
55 
56  double standard_error = all_std_deviation / sqrt(count); // standard error of mean
57  double median_diff = likelihood[range].get_median() - all_median;
58  double t_value = median_diff / standard_error; // predictand of t_value is 0.0
59 
60  t_values.add(fabs(t_value)); // sample t_values
61 
62  double val = 0.7 * t_value;
63 
64  // val outside [-0.5 .. +0.5] -> "t-test fails"
65  //
66  // 0.7 * t_value = approx. t_value / 1.428;
67  //
68  // 1.428 scheint ein fest-kodiertes quantil zu sein
69  // (bzw. dessen Haelfte, da ja nur der Range [-0.5 .. +0.5] getestet wird)
70  // Das eigentliche Quantil ist also ca. 2.856
71  //
72  // Eigentlich haengt das Quantil von der Stichprobengroesse (hier "count",
73  // d.h. "Anzahl der Basen im aktuellen Range") ab.
74  // (vgl. http://de.wikipedia.org/wiki/T-Verteilung#Ausgew.C3.A4hlte_Quantile_der_t-Verteilung )
75 
76  int ival = int (val + .5) + 5;
77 
78  if (ival > 9) ival = 9;
79  if (ival < 0) ival = 0;
80 
81  report[range] = (ival == 5) ? '-' : '0'+ival;
82  }
83  else {
84  report[range] = '.';
85  }
86  }
87  }
88  else {
89  for (size_t range = 0; range<ranges; ++range) {
90  report[range] = '.';
91  }
92  }
93 
94  report[ranges] = 0;
95 
96  return report;
97 }
98 
99 // --------------------------
100 // ColumnQualityInfo
101 
102 ColumnQualityInfo::ColumnQualityInfo(int seq_len, int bucket_size) :
103  stat_half(2),
104  stat_five(5),
105  stat_user(seq_len / bucket_size + 1),
106  stat_cons(2)
107 {}
108 
109 #if defined(WARN_TODO)
110 #warning stat_cons unused - implement
111 #endif
112 
114  const char *species_name, int seq_len, int bucket_size,
115  GB_HASH *species_to_info_hash, int start, int end)
116 {
117  AP_tree *node = STAT_find_node_by_name(st_ml, species_name);
118  if (node) {
119  MostLikelySeq *sml = st_ml->get_ml_vectors(NULp, node, start, end);
120  if (sml) {
122  if (start > 0) {
123  info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name);
124  }
125  else {
126  info = new ColumnQualityInfo(seq_len, bucket_size);
127  GBS_write_hash(species_to_info_hash, species_name, long (info));
128  }
129 
130  int pos;
131  const char *source_sequence = NULp;
132  int source_sequence_len = 0;
133 
134  GBDATA *gb_data = sml->get_bound_species_data();
135  if (gb_data) {
136  source_sequence_len = GB_read_string_count(gb_data);
137  source_sequence = GB_read_char_pntr(gb_data);
138  }
139  if (end > source_sequence_len) end = source_sequence_len;
140 
141  ST_base_vector *vec = sml->tmp_out + start;
142  for (pos = start; pos < end; vec++, pos++) {
143  DNA_Base base = dna_table.char_to_enum(source_sequence[pos]);
144  if (base != ST_UNKNOWN && base != ST_GAP) { // don't count gaps
145  ST_FLOAT max = vec->max_frequency();
146 
147  double val = max / (0.0001 + vec->b[base]);
148  double log_val = log(val);
149  double seq_rel_pos = double(pos)/seq_len;
150 
151  info->stat_half.add_relative(seq_rel_pos, log_val);
152  info->stat_five.add_relative(seq_rel_pos, log_val);
153  info->stat_user.add_relative(seq_rel_pos, log_val);
154  }
155  }
156  }
157  }
158 }
159 
161  const char *alignment_name, const char *species_name,
162  size_t bucket_size, GB_HASH *species_to_info_hash, st_report_enum report,
163  const char *dest_field)
164 {
165  GB_ERROR error = NULp;
166  GBDATA *gb_species = GBT_find_species(gb_main, species_name);
167  if (!gb_species) error = GBS_global_string("Unknown species '%s'", species_name);
168  else {
169  ColumnQualityInfo *info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name);
170  if (!info) error = GBS_global_string("Statistic missing for species '%s'", species_name);
171  else {
172  GBDATA *gb_dest = GBT_searchOrCreate_itemfield_according_to_changekey(gb_species, dest_field, SPECIES_get_selector().change_key_path);
173  if (!gb_dest) error = GB_await_error();
174  else {
175  Sampler t_values; // sample t-values here
176  char *user_str = info->stat_user.generate_string(t_values);
177  {
178  char *half_str = info->stat_half.generate_string(t_values);
179  char *five_str = info->stat_five.generate_string(t_values);
180 
181  GBS_strstruct *buffer = GBS_stropen(2*9 + 3*2 + (2+5+info->overall_range_count()));
182 
183 #if 1
184  GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_median()));
185  GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_sum()));
186 #else
187 #warning t-value summary disabled for test-purposes
188 #endif
189 
190  GBS_chrcat(buffer, 'a'); GBS_strcat(buffer, half_str); GBS_chrcat(buffer, ' ');
191  GBS_chrcat(buffer, 'b'); GBS_strcat(buffer, five_str); GBS_chrcat(buffer, ' ');
192  GBS_chrcat(buffer, 'c'); GBS_strcat(buffer, user_str);
193 
194  error = GB_write_string(gb_dest, GBS_mempntr(buffer));
195  GBS_strforget(buffer);
196 
197  delete [] half_str;
198  delete [] five_str;
199  }
200 
201  if (!error && report) {
202  // name "quality" handled specially by ../EDIT4/EDB_root_bact.cxx@chimera_check_quality_string
203  GBDATA *gb_report = GBT_add_data(gb_species, alignment_name, "quality", GB_STRING);
204 
205  if (!gb_report) error = GB_await_error();
206  else {
207  char *report_str;
208  {
209  size_t filtered_len = filter->get_filtered_length();
210  report_str = new char[filtered_len + 1];
211 
212  for (size_t i = 0; i < filtered_len; i++) {
213  report_str[i] = user_str[i / bucket_size];
214  }
215  report_str[filtered_len] = 0;
216  }
217 
218  {
219  char *blownUp_report = filter->blowup_string(report_str, ' ');
220  error = GB_write_string(gb_report, blownUp_report);
221  free(blownUp_report);
222  }
223  if (report == ST_QUALITY_REPORT_TEMP) error = GB_set_temporary(gb_report);
224 
225  delete [] report_str;
226  }
227  }
228  delete [] user_str;
229  }
230  }
231  }
232 
233  return error;
234 }
235 
236 static void destroy_ColumnQualityInfo(long cl_info) {
238  delete info;
239 }
240 
242  const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size,
243  int marked_only, st_report_enum report, const char *dest_field)
244 {
245 #if defined(WARN_TODO)
246 #warning parameter 'alignment_name' can be retrieved from WeightedFilter (as soon as automatic mapping works for filters)
247 #endif
248  ST_ML st_ml(gb_main);
249  GB_ERROR error = st_ml.calc_st_ml(tree_name, alignment_name, NULp, marked_only, colstat, weighted_filter);
250 
251  if (!error) {
253  size_t species_count;
254 
255  GB_CSTR *snames = GBT_get_names_of_species_in_tree(st_ml.get_gbt_tree(), &species_count);
256  int seq_len = st_ml.get_filtered_length();
257  size_t parts = (seq_len-1)/ST_MAX_SEQ_PART+1;
258 
259  {
260  arb_progress add_progress("Calculating stat", parts*species_count);
261 
262  for (int pos = 0; pos < seq_len && !error; pos += ST_MAX_SEQ_PART) {
263  int end = pos + ST_MAX_SEQ_PART - 1;
264  if (end > seq_len) end = seq_len;
265 
266  for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) {
267  st_ml_add_sequence_part_to_stat(&st_ml, colstat, *pspecies_name, seq_len, bucket_size, species2info, pos, end);
268  add_progress.inc_and_check_user_abort(error);
269  }
270  }
271  }
272 
273  if (!error) {
274  arb_progress res_progress("Calculating reports", species_count);
275  const AP_filter *filter = st_ml.get_filter();
276 
277  for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) {
278  error = st_ml_add_quality_string_to_species(gb_main, filter, alignment_name, *pspecies_name, bucket_size, species2info, report, dest_field);
279  res_progress.inc_and_check_user_abort(error);
280  }
281  }
282 
283  free(snames);
284  GBS_free_hash(species2info);
285  }
286 
287  return error;
288 }
GB_ERROR calc_st_ml(const char *tree_name, const char *alignment_name, const char *species_names, int marked_only, ColumnStat *colstat, const WeightedFilter *weighted_filter) __ATTR__USERESULT
Definition: ST_ml.cxx:493
const char * GB_ERROR
Definition: arb_core.h:25
void add(double val)
Definition: st_quality.hxx:52
class DNA_Table dna_table
const TreeNode * get_gbt_tree() const
Definition: ST_ml.cxx:873
char * blowup_string(const char *filtered_string, char insert) const
Definition: AP_filter.cxx:222
size_t overall_range_count()
Definition: st_quality.hxx:103
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:457
LikelihoodRanges stat_five
Definition: st_quality.hxx:97
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1385
Definition: st_ml.hxx:54
DNA_Base
Definition: st_ml.hxx:49
static void st_ml_add_sequence_part_to_stat(ST_ML *st_ml, ColumnStat *, const char *species_name, int seq_len, int bucket_size, GB_HASH *species_to_info_hash, int start, int end)
Definition: ST_quality.cxx:113
void add_relative(double seq_rel_pos, double probability)
Definition: st_quality.hxx:84
ColumnQualityInfo(int seq_len, int bucket_size)
Definition: ST_quality.cxx:102
ST_ML * st_ml
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
GB_HASH * GBS_create_dynaval_hash(long estimated_elements, GB_CASE case_sens, void(*freefun)(long))
Definition: adhash.cxx:271
static char * alignment_name
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:541
double get_median() const
Definition: st_quality.hxx:37
static ST_base_vector * tmp_out
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
Definition: st_ml.hxx:111
static HelixNrInfo * start
NOT4PERL GBDATA * GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type) __ATTR__DEPRECATED_TODO("better use GBT_create_sequence_data()")
Definition: adali.cxx:559
GBS_strstruct * GBS_stropen(long init_size)
Definition: arb_strbuf.cxx:39
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:910
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
size_t get_count() const
Definition: st_quality.hxx:35
GBDATA * get_bound_species_data() const
GB_ERROR st_ml_check_sequence_quality(GBDATA *gb_main, const char *tree_name, const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size, int marked_only, st_report_enum report, const char *dest_field)
Definition: ST_quality.cxx:241
size_t get_filtered_length() const
Definition: ST_ml.cxx:890
DNA_Base char_to_enum(char i)
MostLikelySeq * get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos)
Definition: ST_ml.cxx:681
void GBS_strcat(GBS_strstruct *strstr, const char *ptr)
Definition: arb_strbuf.cxx:108
static void error(const char *msg)
Definition: mkptypes.cxx:96
ST_FLOAT max_frequency()
Definition: st_ml.hxx:92
st_report_enum
Definition: st_window.hxx:28
void add(double val)
Definition: st_quality.hxx:26
void GBS_strforget(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:76
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
ST_FLOAT b[ST_MAX_BASE]
Definition: st_ml.hxx:60
double get_variance(double median) const
Definition: st_quality.hxx:62
static GB_ERROR st_ml_add_quality_string_to_species(GBDATA *gb_main, const AP_filter *filter, const char *alignment_name, const char *species_name, size_t bucket_size, GB_HASH *species_to_info_hash, st_report_enum report, const char *dest_field)
Definition: ST_quality.cxx:160
char * generate_string(Sampler &t_values)
Definition: ST_quality.cxx:42
GBDATA * GBT_searchOrCreate_itemfield_according_to_changekey(GBDATA *gb_item, const char *field_name, const char *change_key_path)
Definition: adChangeKey.cxx:62
GB_ERROR GB_set_temporary(GBDATA *gbd) __ATTR__USERESULT
Definition: arbdb.cxx:2274
void GBS_chrcat(GBS_strstruct *strstr, char ch)
Definition: arb_strbuf.cxx:119
static void destroy_ColumnQualityInfo(long cl_info)
Definition: ST_quality.cxx:236
char * GBS_mempntr(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:86
LikelihoodRanges stat_half
Definition: st_quality.hxx:96
const AP_filter * get_filter() const
Definition: ST_ml.cxx:889
ItemSelector & SPECIES_get_selector()
Definition: species.cxx:139
LikelihoodRanges stat_user
Definition: st_quality.hxx:98
#define NULp
Definition: cxxforward.h:97
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
GB_CSTR * GBT_get_names_of_species_in_tree(const TreeNode *tree, size_t *count)
Definition: adtree.cxx:1291
AP_tree * STAT_find_node_by_name(ST_ML *st_ml, const char *species_name)
Definition: ST_window.cxx:127
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GBDATA * gb_main
Definition: adname.cxx:33
double get_sum() const
Definition: st_quality.hxx:36
VarianceSampler summarize_all_ranges()
Definition: ST_quality.cxx:34
const size_t ST_MAX_SEQ_PART
static int info[maxsites+1]
const char * GB_CSTR
Definition: arbdb_base.h:25
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:395
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:274
float ST_FLOAT
Definition: st_ml.hxx:46
LikelihoodRanges(size_t ranges)
Definition: ST_quality.cxx:29
#define max(a, b)
Definition: f2c.h:154