ARB
AP_pos_var.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : AP_pos_var.cxx //
4 // Purpose : provides PVP calculation //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "AP_pos_var.h"
12 
13 #include <AP_pro_a_nucs.hxx>
14 #include <TreeNode.h>
15 
16 #include <arb_progress.h>
17 #include <arb_strbuf.h>
18 
19 #include <cctype>
20 
21 #define ap_assert(cond) arb_assert(cond)
22 
23 AP_pos_var::AP_pos_var(GBDATA *gb_main_, const char *ali_name_, long ali_len_, bool is_nuc_, const char *tree_name_) :
24  gb_main(gb_main_),
25  treeLeafs(0),
26  treeNodes(0),
27  progress(NULp),
28  transitions(NULp),
29  transversions(NULp),
30  ali_len(ali_len_),
31  ali_name(ARB_strdup(ali_name_)),
32  tree_name(ARB_strdup(tree_name_)),
33  is_nuc(is_nuc_)
34 {
35  for (int i=0; i<256; i++) {
36  frequencies[i] = NULp;
37  char_2_freq[i] = 0;
38  char_2_transition[i] = 0;
39  char_2_transition[i] = 0;
40  }
41 }
42 
44  delete progress;
45  free(ali_name);
46  free(tree_name);
47  free(transitions);
48  free(transversions);
49  for (int i=0; i<256; i++) free(frequencies[i]);
50 }
51 
52 const char *AP_pos_var::parsimony(TreeNode *tree, GB_UINT4 *bases, GB_UINT4 *tbases) {
54 
55  if (tree->is_leaf()) {
56  if (tree->gb_node) {
57  GBDATA *gb_data = GBT_find_sequence(tree->gb_node, ali_name);
58  if (gb_data) {
59  size_t seq_len = ali_len;
60  if (GB_read_string_count(gb_data) < seq_len) {
61  seq_len = GB_read_string_count(gb_data);
62  }
63 
64  unsigned char *sequence = (unsigned char*)GB_read_char_pntr(gb_data);
65  for (size_t i = 0; i< seq_len; i++) {
66  long L = char_2_freq[sequence[i]];
67  if (L) {
68  ap_assert(frequencies[L]);
69  frequencies[L][i]++;
70  }
71  }
72 
73  if (bases) {
74  for (size_t i = 0; i< seq_len; i++) bases[i] = char_2_transition[sequence[i]];
75  }
76  if (tbases) {
77  for (size_t i = 0; i< seq_len; i++) tbases[i] = char_2_transversion[sequence[i]];
78  }
79  }
80  }
81  }
82  else {
83  GB_UINT4 *ls = ARB_calloc<GB_UINT4>(ali_len);
84  GB_UINT4 *rs = ARB_calloc<GB_UINT4>(ali_len);
85  GB_UINT4 *lts = ARB_calloc<GB_UINT4>(ali_len);
86  GB_UINT4 *rts = ARB_calloc<GB_UINT4>(ali_len);
87 
88  if (!error) error = this->parsimony(tree->get_leftson(), ls, lts);
89  if (!error) error = this->parsimony(tree->get_rightson(), rs, rts);
90  if (!error) {
91  for (long i=0; i< ali_len; i++) {
92  long l = ls[i];
93  long r = rs[i];
94  if (l & r) {
95  if (bases) bases[i] = l&r;
96  }
97  else {
98  transitions[i] ++;
99  if (bases) bases[i] = l|r;
100  }
101  l = lts[i];
102  r = rts[i];
103  if (l & r) {
104  if (tbases) tbases[i] = l&r;
105  }
106  else {
107  transversions[i] ++;
108  if (tbases) tbases[i] = l|r;
109  }
110  }
111  }
112 
113  free(lts);
114  free(rts);
115 
116  free(ls);
117  free(rs);
118  }
119  progress->inc_and_check_user_abort(error);
120  return error;
121 }
122 
123 
124 // Calculate the positional variability: control procedure
126  ap_assert(!treeNodes); // calling retrieve() multiple times is untested
127 
128  GB_ERROR error = NULp;
129  if (!tree) {
130  error = "No tree specified for PVP calculation";
131  }
132  else {
133  treeLeafs = GBT_count_leafs(tree);
134  treeNodes = leafs_2_nodes(treeLeafs, ROOTED); // used for progress
135 
136  ap_assert(treeNodes>0);
137 
138  if (is_nuc) {
139  char_2_freq[(unsigned char)'a'] = 'A';
140  char_2_freq[(unsigned char)'A'] = 'A';
141  char_2_freq[(unsigned char)'c'] = 'C';
142  char_2_freq[(unsigned char)'C'] = 'C';
143  char_2_freq[(unsigned char)'g'] = 'G';
144  char_2_freq[(unsigned char)'G'] = 'G';
145  char_2_freq[(unsigned char)'t'] = 'U';
146  char_2_freq[(unsigned char)'T'] = 'U';
147  char_2_freq[(unsigned char)'u'] = 'U';
148  char_2_freq[(unsigned char)'U'] = 'U';
149 
150  unsigned char *char_2_bitstring = (unsigned char *)AP_create_dna_to_ap_bases();
151 
152  for (int i=0; i<256; i++) {
153  int j;
154  if (i=='-') j = '.'; else j = i;
155  long base = char_2_transition[i] = char_2_bitstring[j];
156  char_2_transversion[i] = 0;
157  if (base & (AP_A | AP_G)) char_2_transversion[i] = 1;
158  if (base & (AP_C | AP_T)) char_2_transversion[i] |= 2;
159  }
160  delete [] char_2_bitstring;
161  }
162  else {
163  AWT_translator *translator = AWT_get_user_translator(gb_main);
164 
165  long char_2_bitstring[256];
166  {
167  for (int i=0; i<256; ++i) {
168  char_2_bitstring[i] = 0;
169  }
170  int aa_max = translator->MaxAA();
171  for (int i = 0; i<=aa_max; ++i) {
172  long bs = translator->index2bitset(i);
173  unsigned char spro = translator->index2spro(i);
174 
175  char_2_bitstring[spro] = bs;
176  }
177  }
178 
179  for (int i=0; i<256; ++i) {
180  char_2_transversion[i] = 0;
181  long base = char_2_transition[i] = char_2_bitstring[i];
182  if (base) char_2_freq[i] = toupper(i);
183  }
184  }
185 
186  progress = new arb_progress(treeNodes);
187 
188  for (int i=0; i<256; i++) {
189  int j = char_2_freq[i];
190  if (j && !frequencies[j]) ARB_calloc(frequencies[j], ali_len);
191  }
192 
193  ARB_calloc(transitions, ali_len);
194  ARB_calloc(transversions, ali_len);
195 
196  error = this->parsimony(tree);
197  }
198 
199  return error;
200 }
201 
203  // deletes existing alignment sub-container from SAI 'sai_name'
204 
205  GBDATA *gb_extended = GBT_find_SAI(gb_main, sai_name);
206  if (gb_extended) {
207  GBDATA *gb_ali = GB_search(gb_extended, ali_name, GB_FIND);
208  if (gb_ali) {
209  return GB_delete(gb_ali);
210  }
211  }
212  return NULp; // sai/ali did not exist
213 }
214 
216  // save whole SAI
217  // or
218  // add alignment sub-container to existing SAI
219 
220  GB_ERROR error = NULp;
221  GBDATA *gb_extended = GBT_find_or_create_SAI(gb_main, sai_name);
222 
223  if (!gb_extended) error = GB_await_error();
224  else {
225  GBDATA *gb_ali = GB_search(gb_extended, ali_name, GB_DB);
226  if (!gb_ali) error = GB_await_error();
227  else {
228  const char *description =
229  GBS_global_string("PVP: Positional Variability by Parsimony: tree '%s' ntaxa %li",
230  tree_name, treeLeafs);
231 
232  error = GBT_write_string(gb_ali, "_TYPE", description);
233  }
234 
235  if (!error) {
236  char *data = ARB_calloc<char>(ali_len+1);
237  int *sum = ARB_calloc<int>(ali_len);
238 
239  for (int j=0; j<256 && !error; j++) { // get sum of frequencies
240  if (frequencies[j]) {
241  for (int i=0; i<ali_len; i++) { // LOOP_VECTORIZED
242  sum[i] += frequencies[j][i];
243  }
244 
245  if (j >= 'A' && j <= 'Z') {
246  GBDATA *gb_freq = GB_search(gb_ali, GBS_global_string("FREQUENCIES/N%c", j), GB_INTS);
247  if (!gb_freq) error = GB_await_error();
248  else error = GB_write_ints(gb_freq, frequencies[j], ali_len);
249  }
250  }
251  }
252 
253  if (!error) {
254  GBDATA *gb_transi = GB_search(gb_ali, "FREQUENCIES/TRANSITIONS", GB_INTS);
255  if (!gb_transi) error = GB_await_error();
256  else error = GB_write_ints(gb_transi, transitions, ali_len);
257  }
258  if (!error) {
259  GBDATA *gb_transv = GB_search(gb_ali, "FREQUENCIES/TRANSVERSIONS", GB_INTS);
260  if (!gb_transv) error = GB_await_error();
261  else error = GB_write_ints(gb_transv, transversions, ali_len);
262  }
263 
264  if (!error) {
265  int max_categ = 0;
266  double logbase = sqrt(2.0);
267  double lnlogbase = log(logbase);
268  double b = .75;
269  double max_rate = 1.0;
270 
271  int tenPercentOfLeafs = treeLeafs*0.1;
272  if ((10*tenPercentOfLeafs) < treeLeafs) ++tenPercentOfLeafs;
273 
274  for (int i=0; i<ali_len; i++) {
275  if (sum[i] < tenPercentOfLeafs) { // less than 10% valid characters; as documented in ../../HELP_SOURCE/source/pos_var_pars.hlp@valid
276  data[i] = '.';
277  continue;
278  }
279  if (transitions[i] == 0) {
280  data[i] = '-';
281  continue;
282  }
283  double rate = transitions[i] / (double)sum[i];
284  if (rate >= b * .95) {
285  rate = b * .95;
286  }
287  rate = -b * log(1-rate/b);
288  if (rate > max_rate) rate = max_rate;
289  rate /= max_rate; // scaled 1.0 == fast rate
290  // ~0.0 slow rate
291  double dcat = -log(rate)/lnlogbase;
292  int icat = (int)dcat;
293  if (icat > 35) icat = 35;
294  if (icat >= max_categ) max_categ = icat + 1;
295  data[i] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[icat];
296  }
297 
298  error = GBT_write_string(gb_ali, "data", data);
299 
300  if (!error) {
301  // Generate Categories
302  GBS_strstruct out(1000);
303  for (int i = 0; i<max_categ; i++) {
304  out.putfloat(pow(1.0/logbase, i));
305  out.put(' ');
306  }
307 
308  error = GBT_write_string(gb_ali, "_CATEGORIES", out.get_data());
309  }
310  }
311 
312  free(sum);
313  free(data);
314  }
315  }
316 
317  return error;
318 }
319 
Definition: arbdbt.h:48
const char * GB_ERROR
Definition: arb_core.h:25
size_t GBT_count_leafs(const TreeNode *tree)
Definition: adtree.cxx:842
char * AP_create_dna_to_ap_bases()
#define ap_assert(cond)
Definition: AP_pos_var.cxx:21
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
int MaxAA() const
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
GBDATA * GBT_find_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:177
unsigned int GB_UINT4
Definition: arbdb_base.h:37
CONSTEXPR_INLINE int leafs_2_nodes(int leafs, TreeModel model)
Definition: arbdbt.h:53
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:916
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
Definition: arbdb.h:78
GB_ERROR save_aliEntry_to_SAI(const char *sai_name)
Definition: AP_pos_var.cxx:215
GB_ERROR delete_aliEntry_from_SAI(const char *sai_name)
Definition: AP_pos_var.cxx:202
void putfloat(float f)
Definition: arb_strbuf.h:241
static void error(const char *msg)
Definition: mkptypes.cxx:96
Definition: arbdb.h:86
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:708
AP_pos_var(GBDATA *gb_main_, const char *ali_name_, long ali_len_, bool is_nuc_, const char *tree_name_)
Definition: AP_pos_var.cxx:23
GB_ERROR retrieve(TreeNode *tree)
Definition: AP_pos_var.cxx:125
Definition: arbdb.h:72
AWT_translator * AWT_get_user_translator(GBDATA *gb_main)
bool is_leaf() const
Definition: TreeNode.h:211
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
GB_ERROR GB_write_ints(GBDATA *gbd, const GB_UINT4 *i, long size)
Definition: arbdb.cxx:1439
#define NULp
Definition: cxxforward.h:116
const char * get_data() const
Definition: arb_strbuf.h:120
GBDATA * GBT_find_or_create_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:65
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_node
Definition: TreeNode.h:173
GBDATA * gb_main
Definition: adname.cxx:32
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
unsigned char index2spro(int index) const
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:332
long index2bitset(int index) const
void put(char c)
Definition: arb_strbuf.h:174