ARB
PT_compress.h
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : PT_compress.h //
4 // Purpose : //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in November 2012 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================ //
11 
12 #ifndef PT_COMPRESS_H
13 #define PT_COMPRESS_H
14 
15 inline size_t count_uint_32(uint32_t *seq, size_t seqsize, uint32_t cmp) {
16  size_t count = 0;
17  while (count<seqsize && seq[count] == cmp) count++;
18  return count*4;
19 }
20 
21 inline size_t count_char(const unsigned char *seq, size_t seqsize, unsigned char c, uint32_t c4) {
22  if (seqsize>0 && seq[0] == c) {
23  size_t count = 1+count_uint_32((uint32_t*)(seq+1), (seqsize-1)/4, c4);
24  for (; count<seqsize && seq[count] == c; ++count) ;
25  return count;
26  }
27  return 0;
28 }
29 
30 inline size_t count_dots(const unsigned char *seq, int seqsize) { return count_char(seq, seqsize, '.', 0x2E2E2E2E); }
31 inline size_t count_gaps(const unsigned char *seq, int seqsize) { return count_char(seq, seqsize, '-', 0x2D2D2D2D); }
32 
33 inline size_t count_gaps_and_dots(const unsigned char *seq, int seqsize) {
34  pt_assert(seqsize>0);
35 
36  size_t count = 0;
37  size_t count2;
38  size_t count3;
39 
40  do {
41  count2 = count_dots(seq+count, seqsize-count);
42  count += count2;
43  count3 = count_gaps(seq+count, seqsize-count);
44  count += count3;
45  }
46  while (count2 || count3);
47  return count;
48 }
49 
50 // uncomment next line to count all bases compressed by ptserver and dump them when program terminates
51 // #define COUNT_COMPRESSES_BASES
52 #if defined(COUNT_COMPRESSES_BASES)
53 class BaseCounter {
54  long count[PT_BASES];
55 public:
56  BaseCounter() {
57  for (int i = 0; i<PT_BASES; ++i) {
58  count[i] = 0;
59  }
60  }
61  ~BaseCounter() {
62  fflush_all();
63  fputs("\nBaseCounter:\n", stderr);
64  for (int i = 0; i<PT_BASES; ++i) {
65  fprintf(stderr, "count[%i]=%li\n", i, count[i]);
66  }
67  }
68 
69  void inc(uchar base) {
70  pt_assert(base<PT_BASES);
71  ++count[base];
72  }
73 };
74 #endif
75 
76 typedef unsigned char uchar;
77 
78 class PT_compressed : virtual Noncopyable {
79  size_t max_input_length;
80  uchar *compressed;
81  unsigned *base_offset; // to previous base
82  size_t size;
83 
84  static bool translation_initialized;
85  static uchar translate[256];
86 
87 #if defined(COUNT_COMPRESSES_BASES)
88  static BaseCounter base_counter;
89 #endif
90 
91  static void initTranslation() {
92  memset(translate, PT_N, 256);
93 
94  translate['A'] = translate['a'] = PT_A;
95  translate['C'] = translate['c'] = PT_C;
96  translate['G'] = translate['g'] = PT_G;
97  translate['T'] = translate['t'] = PT_T;
98  translate['U'] = translate['u'] = PT_T;
99  translate['.'] = PT_QU;
100 
101  translate[0] = PT_B_UNDEF;
102 
103  translation_initialized = true;
104  }
105 
106 public:
107  PT_compressed(size_t max_input_length_)
108  : max_input_length(max_input_length_),
109  compressed(new uchar[max_input_length+1]),
110  base_offset(new unsigned[max_input_length+1]),
111  size(0)
112  {
113  if (!translation_initialized) initTranslation();
114  }
116  delete [] compressed;
117  delete [] base_offset;
118  }
119 
120  void createFrom(const unsigned char * const seq, const size_t length) {
121  pt_assert(length <= max_input_length);
122 
123  size_t base_count = 0;
124  size_t last_base_offset = 0;
125 
126  uchar c = PT_N; // just not PT_QU
127  for (size_t offset = 0; offset<length; ++offset) {
128  if (c == PT_QU) {
129  offset += count_gaps_and_dots(seq + offset, length - offset); // skip over gaps and dots (ignoring multiple dots)
130  }
131  else {
132  offset += count_gaps(seq + offset, length - offset); // skip over gaps only (stops at next dot)
133  }
134 
135  if (offset >= length) break;
136 
137  c = translate[seq[offset]];
138  pt_assert(c != PT_B_UNDEF); // e.g. hit end of sequence
139 
140 #if defined(COUNT_COMPRESSES_BASES)
141  base_counter.inc(c);
142 #endif
143  compressed[base_count] = c;
144  base_offset[base_count++] = offset-last_base_offset;
145  last_base_offset = offset;
146  }
147 
148  if (base_count <= 0) { // may happen if empty (or gap-only) input sequence is passed
149  size = 0;
150  return;
151  }
152 
153  // ensure last entry is a dot
154  if (compressed[base_count-1] != PT_QU) {
155 #if defined(COUNT_COMPRESSES_BASES)
156  base_counter.inc(PT_QU);
157 #endif
158  compressed[base_count] = PT_QU;
159  base_offset[base_count] = 1; // one position behind prev base
160  ++base_count;
161  }
162 
163  // if first entry is a dot, reposition it before 1st non-dot-base
164  if (compressed[0] == PT_QU && base_count>1) {
165  pt_assert(compressed[1] != PT_QU); // many dots should always compress into 1 dot
166  base_offset[0] = base_offset[1]-1;
167  base_offset[1] = 1;
168  }
169 
170  size = base_count;
171  pt_assert(size <= (length+1));
172 #ifdef ARB_64
173  pt_assert(!(size & 0xffffffff00000000)); // must fit into 32 bit
174 #endif
175 
176  }
177  void createFrom(const char * const seq, const size_t length) {
178  createFrom(reinterpret_cast<const unsigned char *>(seq), length);
179  }
180 
181  size_t get_size() const { return size; }
182  size_t get_allowed_size() const { return max_input_length; }
183  const char *get_seq() const { return reinterpret_cast<const char *>(compressed); }
184  const unsigned *get_offsets() const { return base_offset; }
185 };
186 
187 #else
188 #error PT_compress.h included twice
189 #endif // PT_COMPRESS_H
Definition: probe.h:83
void fflush_all()
Definition: PT_tools.h:211
Definition: probe.h:84
unsigned char uchar
Definition: PT_compress.h:76
size_t count_uint_32(uint32_t *seq, size_t seqsize, uint32_t cmp)
Definition: PT_compress.h:15
const unsigned * get_offsets() const
Definition: PT_compress.h:184
Definition: probe.h:85
FILE * seq
Definition: rns.c:46
size_t get_allowed_size() const
Definition: PT_compress.h:182
const char * get_seq() const
Definition: PT_compress.h:183
size_t count_char(const unsigned char *seq, size_t seqsize, unsigned char c, uint32_t c4)
Definition: PT_compress.h:21
void createFrom(const unsigned char *const seq, const size_t length)
Definition: PT_compress.h:120
void createFrom(const char *const seq, const size_t length)
Definition: PT_compress.h:177
BaseCounter(const std::string &Source)
PT_compressed(size_t max_input_length_)
Definition: PT_compress.h:107
#define cmp(h1, h2)
Definition: admap.cxx:50
Definition: probe.h:86
Definition: probe.h:88
#define pt_assert(bed)
Definition: PT_tools.h:22
fputs(TRACE_PREFIX, stderr)
size_t count_dots(const unsigned char *seq, int seqsize)
Definition: PT_compress.h:30
Definition: probe.h:89
size_t get_size() const
Definition: PT_compress.h:181
unsigned char uchar
Definition: gde.hxx:21
#define offset(field)
Definition: GLwDrawA.c:73
size_t count_gaps(const unsigned char *seq, int seqsize)
Definition: PT_compress.h:31
size_t length
size_t count_gaps_and_dots(const unsigned char *seq, int seqsize)
Definition: PT_compress.h:33
Definition: probe.h:87