ARB
ali_profile.hxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ali_profile.hxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #ifndef ALI_PROFILE_HXX
12 #define ALI_PROFILE_HXX
13 
14 #ifndef ALI_ARBDB_HXX
15 #include "ali_arbdb.hxx"
16 #endif
17 #ifndef ALI_PT_HXX
18 #include "ali_pt.hxx"
19 #endif
20 
21 #define ALI_PROFILE_BORDER_LEFT '['
22 #define ALI_PROFILE_BORDER_RIGHT ']'
23 
24 typedef void ALI_MAP_; // make module independent
25 
26 
30 
32 
36 
39 
40  float min_weight;
42 
44 
47 
48  double substitute_matrix[5][5]; // a c g u -
49  double binding_matrix[5][5]; // a c g u -
50 };
51 
52 
53 // Class for a family member
56  float matches;
57  float weight;
58 
59  ali_family_member(ALI_SEQUENCE *seq, float m, float w = 0.0) {
60  sequence = seq;
61  matches = m;
62  weight = w;
63  }
64 };
65 
66 
67 // Class for the profiling
68 class ALI_PROFILE : virtual Noncopyable {
69  ALI_NORM_SEQUENCE *norm_sequence;
70  unsigned long prof_len;
71 
72  long **helix; // base to base connection
73  char **helix_borders; // borders of the helix '[' and ']'
74  unsigned long helix_len;
75 
76  float (**base_weights)[4]; // relative weight of base i
77  float (**sub_costs)[6]; // costs to substitute with base i
78  float sub_costs_maximum;
79 
80  float insert_cost;
81  float multi_insert_cost;
82 
83  float multi_gap_factor;
84 
85  float (*binding_costs)[5][5]; // Matrix for binding costs a c g u -
86  float w_bind_maximum;
87 
88  unsigned long *lmin, *lmax;
89  float ***gap_costs;
90  float ***gap_percents;
91 
92  int is_binding_marker(char c);
93 
95  ALI_PROFILE_CONTEXT *context);
96  void calculate_costs(ALI_TLIST<ali_family_member *> *family_list,
97  ALI_PROFILE_CONTEXT *context);
98 
99  int find_next_helix(char h[], unsigned long h_len, unsigned long pos,
100  unsigned long *helix_nr,
101  unsigned long *start, unsigned long *end);
102  int find_comp_helix(char h[], unsigned long h_len, unsigned long pos,
103  unsigned long helix_nr,
104  unsigned long *start, unsigned long *end);
105  void delete_comp_helix(char h1[], char h2[], unsigned long h_len,
106  unsigned long start, unsigned long end);
107  // int map_helix(char h[], unsigned long h_len,
108  // unsigned long start1, unsigned long end1,
109  // unsigned long start2, unsigned long end2);
110  void initialize_helix(ALI_PROFILE_CONTEXT *context);
111 
112 public:
113 
114  ALI_PROFILE(ALI_SEQUENCE *sequence, ALI_PROFILE_CONTEXT *context);
115  ~ALI_PROFILE();
116 
117  void print(int start = -1, int end = -1) {
118  int i;
119  size_t j;
120 
121  if (start < 0 || (unsigned long)(start) > prof_len - 1)
122  start = 0;
123 
124  if ((unsigned long)(end) > prof_len - 1 || end < 0)
125  end = (int)prof_len - 1;
126 
127  if (start > end) {
128  i = start;
129  start = end;
130  end = i;
131  }
132 
133 
134  printf("\nProfile from %d to %d\n", start, end);
135  printf("Substitutions kosten:\n");
136  for (i = start; i <= end; i++) {
137  printf("%2d: ", i);
138  for (j = 0; j < 6; j++)
139  printf("%4.2f ", (*sub_costs)[i][j]);
140  printf("\n");
141  }
142 
143  printf("\nGap Bereiche:\n");
144  for (i = start; i <= end; i++) {
145  printf("%2d: %3ld %3ld\n", i, lmin[i], lmax[i]);
146  printf(" : ");
147  for (j = 0; j <= lmax[i] - lmin[i] + 1; j++)
148  printf("%4.2f ", (*gap_percents)[i][j]);
149  printf("\n : ");
150  for (j = 0; j <= lmax[i] - lmin[i] + 1; j++)
151  printf("%4.2f ", (*gap_costs)[i][j]);
152  printf("\n");
153  }
154 
155  }
156 
157  int is_in_helix(unsigned long pos,
158  unsigned long *first, unsigned long *last);
159  int is_outside_helix(unsigned long pos,
160  unsigned long *first, unsigned long *last);
161  int complement_position(unsigned long pos) {
162  if (pos >= helix_len)
163  return -1;
164  else
165  return (*helix)[pos];
166  }
167  int is_in_helix(unsigned long pos) {
168  long l;
169  if ((*helix_borders)[pos] == ALI_PROFILE_BORDER_LEFT ||
170  (*helix_borders)[pos] == ALI_PROFILE_BORDER_RIGHT)
171  return 1;
172  for (l = (long) pos - 1; l >= 0; l--)
173  switch ((*helix_borders)[l]) {
175  return 1;
177  return 0;
178  }
179  for (l = (long) pos + 1; l < (long) prof_len; l++)
180  switch ((*helix_borders)[l]) {
182  return 0;
184  return 1;
185  }
186  return 0;
187  }
188 
189  int is_internal_last(unsigned long pos) {
190  return norm_sequence->is_begin(pos + 1);
191  }
192 
193  char *cheapest_sequence();
195  unsigned long i;
196  char *str, *str_buffer;
197 
198  str = (char *) calloc((unsigned int) prof_len, sizeof(char));
199  ali_out_of_memory_if(!str);
200 
201  str_buffer = str;
202  for (i = 0; i < prof_len; i++)
203  switch ((*helix_borders)[i]) {
204  case ALI_PROFILE_BORDER_LEFT: *str++ = '['; break;
205  case ALI_PROFILE_BORDER_RIGHT: *str++ = ']'; break;
206  default: *str++ = ' ';
207  }
208  return str_buffer;
209  }
210 
211  unsigned long length() {
212  return prof_len;
213  }
215  return norm_sequence;
216  }
217  unsigned long sequence_length() {
218  return norm_sequence->length();
219  }
220 
221  float base_weight(unsigned long pos, unsigned char c) {
222  if (c > 3)
223  ali_fatal_error("Out of range", "ALI_PROFILE::base_weight()");
224  return (*base_weights)[pos][c];
225  }
226 
227  float w_sub(unsigned long positiony, unsigned long positionx) {
228  return (*sub_costs)[positiony][norm_sequence->base(positionx)];
229  }
230  float w_sub_gap(unsigned long positiony) {
231  return (*sub_costs)[positiony][4];
232  }
233  float w_sub_gap_cheap(unsigned long positiony) {
234  return (*sub_costs)[positiony][4] * multi_gap_factor;
235  }
236  float w_sub_multi_gap_cheap(unsigned long start, unsigned long end) {
237  float costs = 0.0;
238  unsigned long i;
239  for (i = start; i <= end; i++)
240  costs += w_sub_gap(i);
241  return costs * multi_gap_factor;
242  }
243 
244  float w_sub_maximum() {
245  return sub_costs_maximum;
246  }
247 
248  float w_del(unsigned long begin, unsigned long end) {
249  if (begin <= lmin[end])
250  return (*gap_costs)[end][0];
251  else {
252  if (begin <= lmax[end])
253  return (*gap_costs)[end][begin - lmin[end]];
254  else
255  return (*gap_costs)[end][lmax[end] - lmin[end] + 1];
256  }
257  }
258  float w_del_cheap(unsigned long position) {
259  return (*gap_costs)[position][0];
260  }
261  float w_del_multi(unsigned long start, unsigned long end) {
262  float costs = 0.0;
263  unsigned long i;
264  for (i = start; i <= end; i++)
265  costs += w_del(start, i);
266  return costs;
267  }
268  float w_del_multi_cheap(unsigned long start, unsigned long end) {
269  float costs = 0.0;
270  unsigned long i;
271  for (i = start; i <= end; i++)
272  costs += w_del_cheap(i);
273  return costs;
274  }
275  float w_del_multi_unweighted(unsigned long start, unsigned long end) {
276  float costs = 0.0;
277  unsigned long i;
278  for (i = start; i <= end; i++)
279  costs += w_del(i, i);
280  return costs;
281  }
282 
283  float w_ins(unsigned long /* positionx */, unsigned long /* positiony */) {
284  return insert_cost;
285  }
286  float w_ins_cheap(unsigned long /* positionx */, unsigned long /* positiony */) {
287  return multi_insert_cost;
288  }
289  float w_ins_multi_unweighted(unsigned long startx, unsigned long endx) {
290  return (endx - startx + 1) * insert_cost;
291  }
292  float w_ins_multi(unsigned long startx, unsigned long endx) {
293  return insert_cost + (endx - startx)*multi_insert_cost;
294  }
295  float w_ins_multi_cheap(unsigned long startx, unsigned long endx) {
296  return (endx - startx + 1)*multi_insert_cost;
297  }
298 
299 
300  float gap_percent(unsigned long begin, unsigned long end) {
301  if (begin <= lmin[end])
302  return (*gap_percents)[end][0];
303  else {
304  if (begin <= lmax[end])
305  return (*gap_percents)[end][begin - lmin[end]];
306  else
307  return (*gap_percents)[end][lmax[end] - lmin[end] + 1];
308  }
309  }
310 
311  float w_bind(unsigned long pos_1, unsigned char base_1,
312  unsigned long pos_2, unsigned char base_2) {
313  int i, j;
314  float cost_in, cost = 0;
315  if (ali_is_real_base_or_gap(base_1)) {
316  if (ali_is_real_base_or_gap(base_2)) {
317  return (*binding_costs)[base_1][base_2];
318  }
319  for (i = 0; i < 4; i++)
320  cost += (*base_weights)[pos_2][i] * (*binding_costs)[base_1][i];
321  return cost;
322  }
323  if (ali_is_real_base_or_gap(base_2)) {
324  for (i = 0; i < 4; i++)
325  cost += (*base_weights)[pos_1][i] * (*binding_costs)[i][base_2];
326  return cost;
327  }
328  for (i = 0; i < 4; i++) {
329  for (j = 0, cost_in = 0; j < 4; j++)
330  cost_in += (*base_weights)[pos_2][j] * (*binding_costs)[i][j];
331  cost += (*base_weights)[pos_1][i] * cost_in;
332  }
333  return cost;
334  }
335 
336  float w_binding(unsigned long first_position_of_sequence,
337  ALI_SEQUENCE *sequence);
338 };
339 
340 #else
341 #error ali_profile.hxx included twice
342 #endif // ALI_PROFILE_HXX
float w_bind(unsigned long pos_1, unsigned char base_1, unsigned long pos_2, unsigned char base_2)
float w_ins_multi(unsigned long startx, unsigned long endx)
void ali_out_of_memory_if(bool cond)
Definition: ali_misc.hxx:52
ALI_PROFILE(ALI_SEQUENCE *sequence, ALI_PROFILE_CONTEXT *context)
float base_weight(unsigned long pos, unsigned char c)
long
Definition: AW_awar.cxx:152
float w_sub_gap_cheap(unsigned long positiony)
float w_sub_multi_gap_cheap(unsigned long start, unsigned long end)
float w_binding(unsigned long first_position_of_sequence, ALI_SEQUENCE *sequence)
int is_in_helix(unsigned long pos, unsigned long *first, unsigned long *last)
float w_ins(unsigned long, unsigned long)
float w_del(unsigned long begin, unsigned long end)
float w_sub_maximum()
unsigned long length()
#define ALI_PROFILE_BORDER_RIGHT
Definition: ali_profile.hxx:22
FILE * seq
Definition: rns.c:46
float w_del_multi_cheap(unsigned long start, unsigned long end)
static HelixNrInfo * start
char * cheapest_sequence()
int ali_is_real_base_or_gap(char c)
Definition: ali_misc.hxx:86
ALI_NORM_SEQUENCE * sequence()
int complement_position(unsigned long pos)
unsigned char base(unsigned long position)
float w_ins_cheap(unsigned long, unsigned long)
ALI_SEQUENCE * sequence
Definition: ali_profile.hxx:55
void ALI_MAP_
Definition: ali_profile.hxx:24
char * str
Definition: defines.h:20
int is_internal_last(unsigned long pos)
float w_del_cheap(unsigned long position)
float gap_percent(unsigned long begin, unsigned long end)
float w_del_multi_unweighted(unsigned long start, unsigned long end)
void ali_fatal_error(const char *message, const char *func)
Definition: ali_main.cxx:85
float w_ins_multi_unweighted(unsigned long startx, unsigned long endx)
float w_sub(unsigned long positiony, unsigned long positionx)
unsigned long sequence_length()
float w_ins_multi_cheap(unsigned long startx, unsigned long endx)
int is_in_helix(unsigned long pos)
int is_outside_helix(unsigned long pos, unsigned long *first, unsigned long *last)
char * borders_sequence()
int is_begin(unsigned long pos)
double binding_matrix[5][5]
Definition: ali_profile.hxx:49
void print(int start=-1, int end=-1)
ali_family_member(ALI_SEQUENCE *seq, float m, float w=0.0)
Definition: ali_profile.hxx:59
float w_sub_gap(unsigned long positiony)
unsigned long length()
float w_del_multi(unsigned long start, unsigned long end)
double substitute_matrix[5][5]
Definition: ali_profile.hxx:48
#define ALI_PROFILE_BORDER_LEFT
Definition: ali_profile.hxx:21