ARB
chartable.h
Go to the documentation of this file.
1 // ================================================================= //
2 // //
3 // File : chartable.h //
4 // Purpose : count characters of multiple sequences //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // ================================================================= //
10 
11 #ifndef CHARTABLE_H
12 #define CHARTABLE_H
13 
14 #ifndef ARBTOOLS_H
15 #include <arbtools.h>
16 #endif
17 #ifndef POS_RANGE_H
18 #include <pos_range.h>
19 #endif
20 #ifndef ARBDB_BASE_H
21 #include <arbdb_base.h>
22 #endif
23 #ifndef _STDINT_H
24 #include <stdint.h>
25 #endif
26 
27 #define ct_assert(bed) arb_assert(bed)
28 
29 #ifdef DEBUG
30 // # define TEST_BASES_TABLE
31 #endif
32 
33 #if defined(DEBUG) && !defined(DEVEL_RELEASE)
34 # define TEST_CHAR_TABLE_INTEGRITY
35 #endif
36 
37 #define MAXCHARTABLE 256
38 #define SHORT_TABLE_ELEM_SIZE 1
39 #define SHORT_TABLE_MAX_VALUE 0xff
40 #define LONG_TABLE_ELEM_SIZE 4
41 
42 #define MAX_INDEX_TABLES 128 // max number of different non-ambiguous indices (into BaseFrequencies::bases_table)
43 #define MAX_TARGET_INDICES 4 // max number of non-ambiguous indices affected by one ambiguous code (4 for 'N' in RNA/DNA, 2 for amino-acids)
44 #define MAX_AMBIGUITY_CODES (1+4+6) // for RNA/DNA (3 for amino-acids)
45 
46 #define MAX_USED_BASES_TABLES 256
47 
48 // Note: short tables convert to long tables when adding 22 sequences (for nucleotides)
49 // * maybe remove short tables completely OR
50 // * increase values size in short tables to 16 bit
51 
52 namespace chartable {
53 
54  struct Ambiguity {
58  uint8_t indices; // number of entries in bases_table affected by this ambiguity
59  uint8_t increment; // how much each entry gets incremented
60  uint8_t index[MAX_TARGET_INDICES]; // into BaseFrequencies::bases_table
61  };
62 
63  class SepBaseFreq : virtual Noncopyable {
67  int table_entry_size; // how many bytes are used for each element of 'no_of_bases' (1 or 4 bytes)
68  union {
69  unsigned char *shortTable;
70  int *longTable;
71  } no_of_bases; // counts bases for each sequence position
72  int no_of_entries; // length of bases table
73 
74  int legal(int offset) const { return offset>=0 && offset<no_of_entries; }
75 
76  void set_elem_long(int offset, int value) {
77 #ifdef TEST_BASES_TABLE
78  ct_assert(legal(offset));
79  ct_assert(table_entry_size==LONG_TABLE_ELEM_SIZE);
80 #endif
81  no_of_bases.longTable[offset] = value;
82  }
83 
84  void set_elem_short(int offset, int value) {
85 #ifdef TEST_BASES_TABLE
86  ct_assert(legal(offset));
87  ct_assert(table_entry_size==SHORT_TABLE_ELEM_SIZE);
88  ct_assert(value>=0 && value<=SHORT_TABLE_MAX_VALUE);
89 #endif
90  no_of_bases.shortTable[offset] = value;
91  }
92 
93  int get_elem_long(int offset) const {
94 #ifdef TEST_BASES_TABLE
95  ct_assert(legal(offset));
96  ct_assert(table_entry_size==LONG_TABLE_ELEM_SIZE);
97 #endif
98  return no_of_bases.longTable[offset];
99  }
100 
101  int get_elem_short(int offset) const {
102 #ifdef TEST_BASES_TABLE
103  ct_assert(legal(offset));
104  ct_assert(table_entry_size==SHORT_TABLE_ELEM_SIZE);
105 #endif
106  return no_of_bases.shortTable[offset];
107  }
108 
109  public:
110 
111  SepBaseFreq(int maxseqlength);
112  ~SepBaseFreq();
113 
114  void init(int length);
115  int size() const { return no_of_entries; }
116 
117  int get_table_entry_size() const { return table_entry_size; }
119  bool bigger_table_entry_size_needed(int new_max_count) { return table_entry_size==SHORT_TABLE_ELEM_SIZE ? (new_max_count>SHORT_TABLE_MAX_VALUE) : 0; }
120 
121  int operator[](int offset) const { return table_entry_size==SHORT_TABLE_ELEM_SIZE ? get_elem_short(offset) : get_elem_long(offset); }
122 
123  void inc_short(int offset, int incr) {
124  int old = get_elem_short(offset);
125  ct_assert((old+incr)<=255);
126  set_elem_short(offset, old+incr);
127  }
128  void dec_short(int offset, int decr) {
129  int old = get_elem_short(offset);
130  ct_assert(old>=decr);
131  set_elem_short(offset, old-decr);
132  }
133  void inc_long(int offset, int incr) {
134  int old = get_elem_long(offset);
135  set_elem_long(offset, old+incr);
136  }
137  void dec_long(int offset, int decr) {
138  int old = get_elem_long(offset);
139  ct_assert(old>=decr);
140  set_elem_long(offset, old-decr);
141  }
142 
143  int firstDifference(const SepBaseFreq& other, int start, int end, int *firstDifferentPos) const;
144  int lastDifference(const SepBaseFreq& other, int start, int end, int *lastDifferentPos) const;
145 
146  void add(const SepBaseFreq& other, int start, int end);
147  void sub(const SepBaseFreq& other, int start, int end);
148  void sub_and_add(const SepBaseFreq& Sub, const SepBaseFreq& Add, PosRange range);
149 
150  void change_table_length(int new_length, int default_entry);
151 
152 
153 #if defined(TEST_CHAR_TABLE_INTEGRITY) || defined(ASSERTION_USED)
154  int empty() const;
155 #endif // TEST_CHAR_TABLE_INTEGRITY
156  };
157 
158 };
159 
161 
162 class BaseFrequencies : virtual Noncopyable {
171  typedef SepBaseFreq *SepBaseFreqPtr;
172 
173  SepBaseFreqPtr *bases_table;
174 
175  int sequenceUnits; // # of sequences added to the table (multiplied with 'units')
176  bool ignore; // this table will be ignored when calculating tables higher in hierarchy
177  // (used in EDIT4 to suppress SAIs in tables of ED4_root_group_manager)
178 
179  // @@@ move statics into own class:
180  static bool initialized;
181  static Ambiguity ambiguity_table[MAX_AMBIGUITY_CODES];
182  static uint8_t unitsPerSequence; // multiplier per added sequence (to allow proper distribution of ambiguity codes)
183  static unsigned char char_to_index_tab[MAXCHARTABLE]; // <MAX_INDEX_TABLES = > real index into 'bases_table', else index+MAX_INDEX_TABLES into ambiguity_table
184  static bool is_gap[MAXCHARTABLE];
185  static unsigned char upper_index_chars[MAX_USED_BASES_TABLES];
186  static int used_bases_tables; // size of 'bases_table'
187  static GB_alignment_type ali_type;
188 
189  static inline void set_char_to_index(unsigned char c, int index);
190 
191  void add(const BaseFrequencies& other, int start, int end);
192  void sub(const BaseFrequencies& other, int start, int end);
193 
194  void expand_tables();
195  int get_table_entry_size() const {
196  return linear_table(0).get_table_entry_size();
197  }
198  void prepare_to_add_elements(int new_sequences) {
199  ct_assert(used_bases_tables);
200  if (linear_table(0).bigger_table_entry_size_needed(sequenceUnits+new_sequences*unitsPerSequence)) {
201  expand_tables();
202  }
203  }
204 
205  // linear access to all tables
206  SepBaseFreq& linear_table(int c) { ct_assert(c<used_bases_tables); return *bases_table[c]; }
207  const SepBaseFreq& linear_table(int c) const { ct_assert(c<used_bases_tables); return *bases_table[c]; }
208 
209  // access via character
210  SepBaseFreq& table(int c) { ct_assert(c>0 && c<MAXCHARTABLE); return linear_table(char_to_index_tab[c]); }
211  const SepBaseFreq& table(int c) const { ct_assert(c>0 && c<MAXCHARTABLE); return linear_table(char_to_index_tab[c]); }
212 
213  static unsigned char index_to_upperChar(int index) { return upper_index_chars[index]; }
214 
215  void incr_short(unsigned char c, int offset);
216  void decr_short(unsigned char c, int offset);
217  void incr_long(unsigned char c, int offset);
218  void decr_long(unsigned char c, int offset);
219 
220 #if defined(TEST_CHAR_TABLE_INTEGRITY)
221  void test() const; // test if table is valid (dumps core if invalid)
222 #else
223  void test() const {}
224 #endif
225 
226 public:
227 
228 #if defined(TEST_CHAR_TABLE_INTEGRITY) || defined(ASSERTION_USED)
229  bool ok() const;
230  bool empty() const;
231 #endif
232 
233  BaseFrequencies(int maxseqlength=0);
235 
236  static void setup(const char *gap_chars, GB_alignment_type ali_type_);
237 
238  void ignore_me() { ignore = 1; }
239  int is_ignored() const { return ignore; }
240 
241  void init(int maxseqlength);
242  int size() const { return bases_table[0]->size(); }
243  int added_sequences() const { return sequenceUnits/unitsPerSequence; }
244 
245  void bases_and_gaps_at(int column, int *bases, int *gaps) const;
246  double max_frequency_at(int column, bool ignore_gaps) const;
247 
248  static bool isGap(char c) { return is_gap[safeCharIndex(c)]; }
249 
250  const PosRange *changed_range(const BaseFrequencies& other) const;
251  static const PosRange *changed_range(const char *string1, const char *string2, int min_len);
252 
253  void add(const BaseFrequencies& other);
254  void sub(const BaseFrequencies& other);
255  void sub_and_add(const BaseFrequencies& Sub, const BaseFrequencies& Add, PosRange range);
256 
257  void add(const char *string, int len);
258  void sub(const char *string, int len);
259  void sub_and_add(const char *old_string, const char *new_string, PosRange range);
260 
261  void build_consensus_string_to(char *consensus_string, ExplicitRange range, const ConsensusBuildParams& BK) const;
262  char *build_consensus_string(PosRange r, const ConsensusBuildParams& cbp) const;
264 
265  void change_table_length(int new_length);
266 };
267 
268 #else
269 #error chartable.h included twice
270 #endif // CHARTABLE_H
#define SHORT_TABLE_MAX_VALUE
Definition: chartable.h:39
void add(const SepBaseFreq &other, int start, int end)
Definition: chartable.cxx:80
char * build_consensus_string(const ConsensusBuildParams &cbp) const
Definition: chartable.h:263
BaseFrequencies(int maxseqlength=0)
Definition: chartable.cxx:715
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
bool empty() const
Definition: chartable.cxx:1138
void change_table_length(int new_length, int default_entry)
Definition: chartable.cxx:304
int size() const
Definition: chartable.h:115
bool bigger_table_entry_size_needed(int new_max_count)
Definition: chartable.h:119
char * build_consensus_string(PosRange r, const ConsensusBuildParams &cbp) const
Definition: chartable.cxx:370
void init(int length)
Definition: chartable.cxx:37
void change_table_length(int new_length)
Definition: chartable.cxx:1126
int firstDifference(const SepBaseFreq &other, int start, int end, int *firstDifferentPos) const
Definition: chartable.cxx:192
double max_frequency_at(int column, bool ignore_gaps) const
Definition: chartable.cxx:763
void dec_short(int offset, int decr)
Definition: chartable.h:128
#define MAX_TARGET_INDICES
Definition: chartable.h:43
static PosRange whole()
Definition: pos_range.h:52
uint8_t index[MAX_TARGET_INDICES]
Definition: chartable.h:60
static HelixNrInfo * start
void inc_short(int offset, int incr)
Definition: chartable.h:123
static void setup(const char *gap_chars, GB_alignment_type ali_type_)
Definition: chartable.cxx:604
void init(int maxseqlength)
Definition: chartable.cxx:729
void sub_and_add(const BaseFrequencies &Sub, const BaseFrequencies &Add, PosRange range)
Definition: chartable.cxx:899
void inc_long(int offset, int incr)
Definition: chartable.h:133
static bool isGap(char c)
Definition: chartable.h:248
unsigned char * shortTable
Definition: chartable.h:69
const PosRange * changed_range(const BaseFrequencies &other) const
Definition: chartable.cxx:821
SepBaseFreq(int maxseqlength)
Definition: chartable.cxx:288
#define MAXCHARTABLE
Definition: chartable.h:37
void bases_and_gaps_at(int column, int *bases, int *gaps) const
Definition: chartable.cxx:738
#define ct_assert(bed)
Definition: chartable.h:27
void dec_long(int offset, int decr)
Definition: chartable.h:137
#define LONG_TABLE_ELEM_SIZE
Definition: chartable.h:40
void ignore_me()
Definition: chartable.h:238
GB_alignment_type
Definition: arbdb_base.h:61
int size() const
Definition: chartable.h:242
void expand_table_entry_size()
Definition: chartable.cxx:58
#define SHORT_TABLE_ELEM_SIZE
Definition: chartable.h:38
int lastDifference(const SepBaseFreq &other, int start, int end, int *lastDifferentPos) const
Definition: chartable.cxx:239
void sub(const SepBaseFreq &other, int start, int end)
Definition: chartable.cxx:110
int get_table_entry_size() const
Definition: chartable.h:117
#define MAX_USED_BASES_TABLES
Definition: chartable.h:46
int operator[](int offset) const
Definition: chartable.h:121
bool ok() const
Definition: chartable.cxx:1148
void sub_and_add(const SepBaseFreq &Sub, const SepBaseFreq &Add, PosRange range)
Definition: chartable.cxx:140
#define offset(field)
Definition: GLwDrawA.c:73
void build_consensus_string_to(char *consensus_string, ExplicitRange range, const ConsensusBuildParams &BK) const
Definition: chartable.cxx:386
int is_ignored() const
Definition: chartable.h:239
#define MAX_AMBIGUITY_CODES
Definition: chartable.h:44
size_t length
int added_sequences() const
Definition: chartable.h:243