ARB
ST_quality.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ST_quality.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "st_quality.hxx"
12 #include "st_ml.hxx"
13 #include "MostLikelySeq.hxx"
14 
15 #include <ColumnStat.hxx>
16 #include <BI_helix.hxx>
17 #include <AP_filter.hxx>
18 #include <arb_progress.h>
19 #include <arbdbt.h>
20 #include <arb_strbuf.h>
21 
22 #include <cctype>
23 #include <cmath>
24 #include <items.h>
25 
26 // --------------------------
27 // LikelihoodRanges
28 
29 LikelihoodRanges::LikelihoodRanges(size_t no_of_ranges) {
30  ranges = no_of_ranges;
31  likelihood = new VarianceSampler[ranges];
32 }
33 
35  VarianceSampler allRange;
36  for (size_t range = 0; range<ranges; ++range) {
37  allRange.add(likelihood[range]);
38  }
39  return allRange;
40 }
41 
44  char *report = new char[ranges+1];
45 
46  if (allRange.get_count()>1) {
47  double all_median = allRange.get_median();
48  double all_variance = allRange.get_variance(all_median);
49  double all_std_deviation = sqrt(all_variance);
50 
51  for (size_t range = 0; range<ranges; ++range) {
52  size_t count = likelihood[range].get_count();
53  if (count>1) {
54  // perform students-t-test
55 
56  double standard_error = all_std_deviation / sqrt(count); // standard error of mean
57  double median_diff = likelihood[range].get_median() - all_median;
58  double t_value = median_diff / standard_error; // predictand of t_value is 0.0
59 
60  t_values.add(fabs(t_value)); // sample t_values
61 
62  double val = 0.7 * t_value;
63 
64  // val outside [-0.5 .. +0.5] -> "t-test fails"
65  //
66  // 0.7 * t_value = approx. t_value / 1.428;
67  //
68  // 1.428 scheint ein fest-kodiertes quantil zu sein
69  // (bzw. dessen Haelfte, da ja nur der Range [-0.5 .. +0.5] getestet wird)
70  // Das eigentliche Quantil ist also ca. 2.856
71  //
72  // Eigentlich haengt das Quantil von der Stichprobengroesse (hier "count",
73  // d.h. "Anzahl der Basen im aktuellen Range") ab.
74  // (vgl. http://de.wikipedia.org/wiki/T-Verteilung#Ausgew.C3.A4hlte_Quantile_der_t-Verteilung )
75 
76  int ival = int (val + .5) + 5;
77 
78  if (ival > 9) ival = 9;
79  if (ival < 0) ival = 0;
80 
81  report[range] = (ival == 5) ? '-' : '0'+ival;
82  }
83  else {
84  report[range] = '.';
85  }
86  }
87  }
88  else {
89  for (size_t range = 0; range<ranges; ++range) {
90  report[range] = '.';
91  }
92  }
93 
94  report[ranges] = 0;
95 
96  return report;
97 }
98 
99 // --------------------------
100 // ColumnQualityInfo
101 
102 ColumnQualityInfo::ColumnQualityInfo(int seq_len, int bucket_size) :
103  stat_half(2),
104  stat_five(5),
105  stat_user(seq_len / bucket_size + 1),
106  stat_cons(2) // @@@ stat_cons is unused - implement
107 {}
108 
109 
111  const char *species_name, int seq_len, int bucket_size,
112  GB_HASH *species_to_info_hash, int start, int end)
113 {
114  AP_tree *node = STAT_find_node_by_name(st_ml, species_name);
115  if (node) {
116  MostLikelySeq *sml = st_ml->get_ml_vectors(NULp, node, start, end);
117  if (sml) {
119  if (start > 0) {
120  info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name);
121  }
122  else {
123  info = new ColumnQualityInfo(seq_len, bucket_size);
124  GBS_write_hash(species_to_info_hash, species_name, long (info));
125  }
126 
127  int pos;
128  const char *source_sequence = NULp;
129  int source_sequence_len = 0;
130 
131  GBDATA *gb_data = sml->get_bound_species_data();
132  if (gb_data) {
133  source_sequence_len = GB_read_string_count(gb_data);
134  source_sequence = GB_read_char_pntr(gb_data);
135  }
136  if (end > source_sequence_len) end = source_sequence_len;
137 
138  ST_base_vector *vec = sml->tmp_out + start;
139  for (pos = start; pos < end; vec++, pos++) {
140  DNA_Base base = dna_table.char_to_enum(source_sequence[pos]);
141  if (base != ST_UNKNOWN && base != ST_GAP) { // don't count gaps
142  ST_FLOAT max = vec->max_frequency();
143 
144  double val = max / (0.0001 + vec->b[base]);
145  double log_val = log(val);
146  double seq_rel_pos = double(pos)/seq_len;
147 
148  info->stat_half.add_relative(seq_rel_pos, log_val);
149  info->stat_five.add_relative(seq_rel_pos, log_val);
150  info->stat_user.add_relative(seq_rel_pos, log_val);
151  }
152  }
153  }
154  }
155 }
156 
158  const char *alignment_name, const char *species_name,
159  size_t bucket_size, GB_HASH *species_to_info_hash, st_report_enum report,
160  const char *dest_field)
161 {
162  GB_ERROR error = NULp;
163  GBDATA *gb_species = GBT_find_species(gb_main, species_name);
164  if (!gb_species) error = GBS_global_string("Unknown species '%s'", species_name);
165  else {
166  ColumnQualityInfo *info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name);
167  if (!info) error = GBS_global_string("Statistic missing for species '%s'", species_name);
168  else {
169  GBDATA *gb_dest = GBT_searchOrCreate_itemfield_according_to_changekey(gb_species, dest_field, SPECIES_get_selector().change_key_path);
170  if (!gb_dest) error = GB_await_error();
171  else {
172  Sampler t_values; // sample t-values here
173  char *user_str = info->stat_user.generate_string(t_values);
174  {
175  char *half_str = info->stat_half.generate_string(t_values);
176  char *five_str = info->stat_five.generate_string(t_values);
177 
178  GBS_strstruct buffer(2*9 + 3*2 + (2+5+info->overall_range_count()));
179 
180 #if 1
181  buffer.nprintf(30, "%8.3f ", t_values.get_median());
182  buffer.nprintf(30, "%8.3f ", t_values.get_sum());
183 #else
184 #warning t-value summary disabled for test-purposes
185 #endif
186 
187  buffer.put('a'); buffer.cat(half_str); buffer.put(' ');
188  buffer.put('b'); buffer.cat(five_str); buffer.put(' ');
189  buffer.put('c'); buffer.cat(user_str);
190 
191  error = GB_write_string(gb_dest, buffer.get_data());
192 
193  delete [] half_str;
194  delete [] five_str;
195  }
196 
197  if (!error && report) {
198  // name "quality" handled specially by ../EDIT4/EDB_root_bact.cxx@chimera_check_quality_string
199  GBDATA *gb_report = GBT_add_data(gb_species, alignment_name, "quality", GB_STRING);
200 
201  if (!gb_report) error = GB_await_error();
202  else {
203  char *report_str;
204  {
205  size_t filtered_len = filter->get_filtered_length();
206  report_str = new char[filtered_len + 1];
207 
208  for (size_t i = 0; i < filtered_len; i++) {
209  report_str[i] = user_str[i / bucket_size];
210  }
211  report_str[filtered_len] = 0;
212  }
213 
214  {
215  char *blownUp_report = filter->blowup_string(report_str, ' ');
216  error = GB_write_string(gb_report, blownUp_report);
217  free(blownUp_report);
218  }
219  if (report == ST_QUALITY_REPORT_TEMP) error = GB_set_temporary(gb_report);
220 
221  delete [] report_str;
222  }
223  }
224  delete [] user_str;
225  }
226  }
227  }
228 
229  return error;
230 }
231 
232 static void destroy_ColumnQualityInfo(long cl_info) {
234  delete info;
235 }
236 
238  const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size,
239  int marked_only, st_report_enum report, const char *dest_field)
240 {
241  // @@@ parameter 'alignment_name' can be retrieved from WeightedFilter (as soon as automatic mapping works for filters)
242 
243  ST_ML st_ml(gb_main);
244  GB_ERROR error = st_ml.calc_st_ml(tree_name, alignment_name, NULp, marked_only, colstat, weighted_filter);
245 
246  if (!error) {
248  size_t species_count;
249 
250  GB_CSTR *snames = GBT_get_names_of_species_in_tree(st_ml.get_gbt_tree(), &species_count);
251  int seq_len = st_ml.get_filtered_length();
252  size_t parts = (seq_len-1)/ST_MAX_SEQ_PART+1;
253 
254  {
255  arb_progress add_progress("Calculating stat", parts*species_count);
256 
257  for (int pos = 0; pos < seq_len && !error; pos += ST_MAX_SEQ_PART) {
258  int end = pos + ST_MAX_SEQ_PART - 1;
259  if (end > seq_len) end = seq_len;
260 
261  for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) {
262  st_ml_add_sequence_part_to_stat(&st_ml, colstat, *pspecies_name, seq_len, bucket_size, species2info, pos, end);
263  add_progress.inc_and_check_user_abort(error);
264  }
265  }
266  }
267 
268  if (!error) {
269  arb_progress res_progress("Calculating reports", species_count);
270  const AP_filter *filter = st_ml.get_filter();
271 
272  for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) {
273  error = st_ml_add_quality_string_to_species(gb_main, filter, alignment_name, *pspecies_name, bucket_size, species2info, report, dest_field);
274  res_progress.inc_and_check_user_abort(error);
275  }
276  }
277 
278  free(snames);
279  GBS_free_hash(species2info);
280  }
281 
282  return error;
283 }
GB_ERROR calc_st_ml(const char *tree_name, const char *alignment_name, const char *species_names, int marked_only, ColumnStat *colstat, const WeightedFilter *weighted_filter) __ATTR__USERESULT
Definition: ST_ml.cxx:491
const char * GB_ERROR
Definition: arb_core.h:25
void add(double val)
Definition: st_quality.hxx:52
class DNA_Table dna_table
const TreeNode * get_gbt_tree() const
Definition: ST_ml.cxx:871
char * blowup_string(const char *filtered_string, char insert) const
Definition: AP_filter.cxx:222
size_t overall_range_count()
Definition: st_quality.hxx:103
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:454
LikelihoodRanges stat_five
Definition: st_quality.hxx:97
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1387
Definition: st_ml.hxx:54
DNA_Base
Definition: st_ml.hxx:49
static void st_ml_add_sequence_part_to_stat(ST_ML *st_ml, ColumnStat *, const char *species_name, int seq_len, int bucket_size, GB_HASH *species_to_info_hash, int start, int end)
Definition: ST_quality.cxx:110
void add_relative(double seq_rel_pos, double probability)
Definition: st_quality.hxx:84
ColumnQualityInfo(int seq_len, int bucket_size)
Definition: ST_quality.cxx:102
ST_ML * st_ml
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
GB_HASH * GBS_create_dynaval_hash(long estimated_elements, GB_CASE case_sens, void(*freefun)(long))
Definition: adhash.cxx:271
static char * alignment_name
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:538
double get_median() const
Definition: st_quality.hxx:37
void cat(const char *from)
Definition: arb_strbuf.h:183
static ST_base_vector * tmp_out
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
Definition: st_ml.hxx:111
static HelixNrInfo * start
NOT4PERL GBDATA * GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type) __ATTR__DEPRECATED_TODO("better use GBT_create_sequence_data()")
Definition: adali.cxx:597
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:916
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
size_t get_count() const
Definition: st_quality.hxx:35
GBDATA * get_bound_species_data() const
GB_ERROR st_ml_check_sequence_quality(GBDATA *gb_main, const char *tree_name, const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size, int marked_only, st_report_enum report, const char *dest_field)
Definition: ST_quality.cxx:237
size_t get_filtered_length() const
Definition: ST_ml.cxx:888
DNA_Base char_to_enum(char i)
MostLikelySeq * get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos)
Definition: ST_ml.cxx:679
static void error(const char *msg)
Definition: mkptypes.cxx:96
ST_FLOAT max_frequency()
Definition: st_ml.hxx:92
st_report_enum
Definition: st_window.hxx:28
void add(double val)
Definition: st_quality.hxx:26
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
ST_FLOAT b[ST_MAX_BASE]
Definition: st_ml.hxx:60
double get_variance(double median) const
Definition: st_quality.hxx:62
static GB_ERROR st_ml_add_quality_string_to_species(GBDATA *gb_main, const AP_filter *filter, const char *alignment_name, const char *species_name, size_t bucket_size, GB_HASH *species_to_info_hash, st_report_enum report, const char *dest_field)
Definition: ST_quality.cxx:157
char * generate_string(Sampler &t_values)
Definition: ST_quality.cxx:42
GBDATA * GBT_searchOrCreate_itemfield_according_to_changekey(GBDATA *gb_item, const char *field_name, const char *change_key_path)
Definition: adChangeKey.cxx:61
GB_ERROR GB_set_temporary(GBDATA *gbd) __ATTR__USERESULT
Definition: arbdb.cxx:2282
static void destroy_ColumnQualityInfo(long cl_info)
Definition: ST_quality.cxx:232
LikelihoodRanges stat_half
Definition: st_quality.hxx:96
const AP_filter * get_filter() const
Definition: ST_ml.cxx:887
void nprintf(size_t maxlen, const char *templat,...) __ATTR__FORMAT_MEMBER(2)
Definition: arb_strbuf.cxx:29
ItemSelector & SPECIES_get_selector()
Definition: species.cxx:139
LikelihoodRanges stat_user
Definition: st_quality.hxx:98
#define NULp
Definition: cxxforward.h:114
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
GB_CSTR * GBT_get_names_of_species_in_tree(const TreeNode *tree, size_t *count)
Definition: adtree.cxx:1345
const char * get_data() const
Definition: arb_strbuf.h:120
AP_tree * STAT_find_node_by_name(ST_ML *st_ml, const char *species_name)
Definition: ST_window.cxx:127
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:32
double get_sum() const
Definition: st_quality.hxx:36
VarianceSampler summarize_all_ranges()
Definition: ST_quality.cxx:34
const size_t ST_MAX_SEQ_PART
static int info[maxsites+1]
const char * GB_CSTR
Definition: arbdb_base.h:25
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:392
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:332
float ST_FLOAT
Definition: st_ml.hxx:46
LikelihoodRanges(size_t ranges)
Definition: ST_quality.cxx:29
void put(char c)
Definition: arb_strbuf.h:158
#define max(a, b)
Definition: f2c.h:154