ARB
seq_search.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : seq_search.cxx //
4 // Purpose : //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "seq_search.hxx"
13 
14 #include <arbdbt.h>
15 #include <cstdarg>
16 #include <climits>
17 #include <cctype>
18 
19 #define MESSAGE_BUFFERSIZE 300
20 
21 #if defined(DEBUG)
22 inline int check_equal(int o, int n) {
23  if (o != n) {
24  fprintf(stderr, "o=%i\nn=%i\n", o, n);
25  fflush(stderr);
26  }
27  fa_assert(o == n);
28  return n;
29 }
30 #endif
31 
32 __ATTR__FORMAT(1) static void messagef(const char *format, ...) {
33  va_list argp;
35 
36  va_start(argp, format);
37 
38  IF_ASSERTION_USED(int chars =) vsprintf(buffer, format, argp);
40 
41  va_end(argp);
42  GB_warning(buffer);
43 }
44 
45 void Dots::append(Dots *neu) {
46  if (Next) Next->append(neu);
47  else Next = neu;
48 }
49 
51 
52 CompactedSequence::CompactedSequence(const char *Text, int Length, const char *name, int start_offset)
53  : basepos(Text, Length, pred_is_ali_gap),
54  myName(strdup(name)),
55  myStartOffset(0), // corrected at end of ctor (otherwise calculations below go wrong)
56  dots(NULp)
57  // referred(1)
58 {
59  fa_assert(Length>0);
60 
61  {
62  long firstBase = basepos.first_base_abspos();
63  long lastBase = basepos.last_base_abspos();
64  int dotOffset = firstBase + strcspn(Text+firstBase, "."); // skip non-dots
65 
66  while (dotOffset <= lastBase) {
67  fa_assert(Text[dotOffset] == '.');
68  storeDots(basepos.abs_2_rel(dotOffset));
69  dotOffset += strspn(Text+dotOffset, "."); // skip dots
70  dotOffset += strcspn(Text+dotOffset, "."); // skip non-dots
71  }
72  }
73 
74  int cLen = length();
75  myText = new char[cLen+1];
76  myText[cLen] = 0;
77 
78  gapsBeforePosition = new int[cLen+1];
79 
80  for (int cPos = 0; cPos<cLen; ++cPos) {
81  int xPos = basepos.rel_2_abs(cPos);
82  myText[cPos] = toupper(Text[xPos]);
83  gapsBeforePosition[cPos] = no_of_gaps_before(cPos);
84  }
85  if (cLen>0) {
86  gapsBeforePosition[cLen] = no_of_gaps_before(cLen); // gaps before end of sequence
87  }
88 
89  myStartOffset += start_offset;
90 }
91 
93  delete [] myText;
94  delete[] gapsBeforePosition;
95  free(myName);
96  delete dots;
97 }
98 
100  if (myNext) delete myNext;
101 }
102 
104  memset((char*)myOffset, 0, MAX_TRIPLES*sizeof(*myOffset));
105  SequencePosition triple(seq);
106 
107  mySequence = &seq;
108 
109  while (triple.rightOf()>=3) { // enough text for triple?
110  int tidx = triple_index(triple.text());
111  TripleOffset *top = new TripleOffset(triple.leftOf(), myOffset[tidx]);
112 
113  myOffset[tidx] = top;
114  ++triple;
115  }
116 }
117 
119  for (int tidx = 0; tidx<MAX_TRIPLES; ++tidx) {
120  delete myOffset[tidx];
121  }
122 }
123 
125  long off = 0;
126  long rest = used;
127 
128  while (rest) {
129  const char *qual = quality();
130  char *found = (char*)memchr(qual+off, '?', rest); // search for next '?' in myQuality
131 
132  if (!found || (found-qual) >= rest) break;
133 
134  long cnt;
135  for (cnt=0; found[cnt]=='?'; cnt++) found[cnt] = '+'; // count # of unaligned positions and change them to '+'
136 
137  long from = found-myQuality;
138  long b_off = from-1; // position before unaligned positions
139  long a_off = from+cnt; // positions after unaligned positions
140 
141  long before, after;
142  for (before=0; b_off>=0 && isGlobalGap(b_off); before++, b_off--) ; // count free positions before unaligned positions
143  for (after=0; a_off<used && isGlobalGap(a_off); after++, a_off++) ; // count free positions after unaligned positions
144 
145  if (b_off<=0) before = LONG_MAX;
146  if (a_off>=used) after = LONG_MAX;
147 
148  if (before<after) { // move to the left
149  if (before) {
150  long to = from-before;
151  while (cnt--) moveUnaligned(from++, to++);
152  }
153  }
154  else if (after!=LONG_MAX) { // move to the right
155  if (after) {
156  from += cnt-1; // copy from right to left!
157  long to = from+after;
158 
159  while (cnt--) moveUnaligned(from--, to--);
160  }
161  }
162 
163  rest -= (a_off-off);
164  off = a_off;
165  }
166 }
167 
169  long rest = used;
170  long off = 0; // real position in sequence
171  long count = 0; // # of bases left of this position
172  long nextDot = slaveSequence.firstDotPosition();
173 
174  while (nextDot!=-1 && rest) {
175  unsigned char gap = myBuffer[off];
176  if (is_gap(gap)) {
177  if (nextDot==count) {
178  myBuffer[off] = '.';
179  }
180  }
181  else {
182  count++;
183  if (nextDot==count) {
184  messagef("Couldn't restore dots at offset %li of '%s' (gap removed by aligner)",
185  off, slaveSequence.name());
186  }
187  if (nextDot<count) {
188  nextDot = slaveSequence.nextDotPosition();
189  }
190  }
191 
192  off++;
193  rest--;
194  }
195 }
196 
198  for (int i=0; is_gap(myBuffer[i]) && i<used; ++i) myBuffer[i] = '.';
199  for (int i=used-1; is_gap(myBuffer[i]) && i>=0; --i) myBuffer[i] = '.';
200 }
201 
202 // --------------------------------------------------------------------------------
203 
204 #ifdef UNIT_TESTS
205 #include <test_unit.h>
206 #include <test_helpers.h>
207 
208 static int get_dotpos(const CompactedSubSequence& css, int i) {
209  int pos = css.firstDotPosition();
210  while (i) {
211  --i;
212  pos = css.nextDotPosition();
213  }
214  return pos;
215 }
216 static int count_dotpos(const CompactedSubSequence& css) {
217  int pos = css.firstDotPosition();
218  int count = 0;
219  while (pos != -1) {
220  ++count;
221  pos = css.nextDotPosition();
222  }
223  return count;
224 }
225 
226 struct bind_css {
228  int test_mode;
229  bind_css(CompactedSubSequence& css_) : css(css_), test_mode(-1) {}
230 
231  int operator()(int i) const {
232  switch (test_mode) {
233  case 0: return css.compPosition(i);
234  case 1: return css.expdPosition(i);
235  case 2: return css[i];
236  case 3: return css.no_of_gaps_before(i);
237  case 4: return css.no_of_gaps_after(i);
238  case 5: return get_dotpos(css, i);
239  }
240  fa_assert(0);
241  return -666;
242  }
243 };
244 
245 #define TEST_EXPECT_CSS_SELF_REFLEXIVE(css) do { \
246  for (int b = 0; b<css.length(); ++b) { \
247  int x = css.expdPosition(b); \
248  int c = css.compPosition(x); \
249  TEST_EXPECT_EQUAL(c,b); \
250  } \
251  } while(0)
252 
253 #define CSS_COMMON(in,offset) \
254  int len = strlen(in); \
255  fprintf(stderr, "in='%s'\n", in); \
256  CompactedSubSequence css(in, len, "noname", offset); \
257  TEST_EXPECT_CSS_SELF_REFLEXIVE(css); \
258  bind_css bound_css(css)
259 
260 #define GEN_COMP_EXPD() \
261  bound_css.test_mode = 0; \
262  char *comp = collectIntFunResults(bound_css, 0, css.expdLength()-1, 3, 0, 1); \
263  bound_css.test_mode = 1; \
264  char *expd = collectIntFunResults(bound_css, 0, css.length(), 3, 0, 0)
265 
266 #define GEN_TEXT(in) \
267  bound_css.test_mode = 2; \
268  char *text = collectIntFunResults(bound_css, 0, css.length()-1, 3, 0, 0)
269 
270 #define GEN_GAPS() \
271  bound_css.test_mode = 3; \
272  char *gaps_before = collectIntFunResults(bound_css, 0, css.length(), 3, 0, 0); \
273  bound_css.test_mode = 4; \
274  char *gaps_after = collectIntFunResults(bound_css, 0, css.length()-1, 3, 0, 1)
275 
276 #define GEN_DOTS(in) \
277  bound_css.test_mode = 5; \
278  char *dots = collectIntFunResults(bound_css, 0, count_dotpos(css)-1, 3, 0, 1)
279 
280 #define FREE_COMP_EXPD() \
281  free(expd); \
282  free(comp)
283 
284 #define FREE_GAPS() \
285  free(gaps_before); \
286  free(gaps_after)
287 
288 #define COMP_EXPD_CHECK(exp_comp,exp_expd) \
289  GEN_COMP_EXPD(); \
290  TEST_EXPECT_EQUAL(comp, exp_comp); \
291  TEST_EXPECT_EQUAL(expd, exp_expd); \
292  FREE_COMP_EXPD()
293 
294 #define GAPS_CHECK(exp_before,exp_after) \
295  GEN_GAPS(); \
296  TEST_EXPECT_EQUAL(gaps_before, exp_before); \
297  TEST_EXPECT_EQUAL(gaps_after, exp_after); \
298  FREE_GAPS()
299 
300 #define DOTS_CHECK(exp_dots) \
301  GEN_DOTS(); \
302  TEST_EXPECT_EQUAL(dots, exp_dots); \
303  free(dots)
304 
305 // ------------------------------------------------------------
306 
307 #define TEST_CS_EQUALS(in,exp_comp,exp_expd) do { \
308  CSS_COMMON(in, 0); \
309  COMP_EXPD_CHECK(exp_comp,exp_expd); \
310  } while(0)
311 
312 #define TEST_CS_EQUALS_OFFSET(in,offset,exp_comp,exp_expd) do { \
313  CSS_COMMON(in, offset); \
314  COMP_EXPD_CHECK(exp_comp,exp_expd); \
315  } while(0)
316 
317 #define TEST_GAPS_EQUALS_OFFSET(in,offset,exp_before,exp_after) do { \
318  CSS_COMMON(in, offset); \
319  GAPS_CHECK(exp_before,exp_after); \
320  } while(0)
321 
322 #define TEST_DOTS_EQUALS_OFFSET(in,offset,exp_dots) do { \
323  CSS_COMMON(in, offset); \
324  DOTS_CHECK(exp_dots); \
325  } while(0)
326 
327 #define TEST_CS_TEXT(in,exp_text) do { \
328  CSS_COMMON(in, 0); \
329  GEN_TEXT(in); \
330  TEST_EXPECT_EQUAL(text, exp_text); \
331  free(text); \
332  } while(0)
333 
334 #define TEST_CS_CBROKN(in,exp_comp,exp_expd) do { \
335  CSS_COMMON(in, 0); \
336  GEN_COMP_EXPD(); \
337  TEST_EXPECT_EQUAL__BROKEN(comp, exp_comp); \
338  TEST_EXPECT_EQUAL(expd, exp_expd); \
339  FREE_COMP_EXPD(); \
340  } while(0)
341 
342 __ATTR__REDUCED_OPTIMIZE void TEST_EARLY_CompactedSequence() {
343  // reproduce a bug in compPosition
344 
345  // no base
346  TEST_CS_EQUALS("-", " 0 [0]", " 1");
347  TEST_CS_EQUALS("--", " 0 0 [0]", " 2");
348  TEST_CS_EQUALS("---", " 0 0 0 [0]", " 3");
349 
350  TEST_CS_TEXT("---", "");
351  TEST_CS_TEXT("-?~", "");
352  TEST_CS_TEXT("-A-", " 65"); // "A"
353  TEST_CS_TEXT("A-C", " 65 67"); // "AC"
354 
355  TEST_CS_EQUALS("----------", " 0 0 0 0 0 0 0 0 0 0 [0]", " 10");
356 
357  // one base
358  TEST_CS_EQUALS("A---------", " 0 1 1 1 1 1 1 1 1 1 [1]", " 0 10");
359  TEST_CS_EQUALS("-A--------", " 0 0 1 1 1 1 1 1 1 1 [1]", " 1 10");
360  TEST_CS_EQUALS("---A------", " 0 0 0 0 1 1 1 1 1 1 [1]", " 3 10");
361  TEST_CS_EQUALS("-----A----", " 0 0 0 0 0 0 1 1 1 1 [1]", " 5 10");
362  TEST_CS_EQUALS("-------A--", " 0 0 0 0 0 0 0 0 1 1 [1]", " 7 10");
363  TEST_CS_EQUALS("---------A", " 0 0 0 0 0 0 0 0 0 0 [1]", " 9 10");
364 
365  // two bases
366  TEST_CS_EQUALS("AC--------", " 0 1 2 2 2 2 2 2 2 2 [2]", " 0 1 10");
367 
368  TEST_CS_EQUALS("A-C-------", " 0 1 1 2 2 2 2 2 2 2 [2]", " 0 2 10");
369  TEST_CS_EQUALS("A--------C", " 0 1 1 1 1 1 1 1 1 1 [2]", " 0 9 10");
370  TEST_CS_EQUALS("-AC-------", " 0 0 1 2 2 2 2 2 2 2 [2]", " 1 2 10");
371  TEST_CS_EQUALS("-A------C-", " 0 0 1 1 1 1 1 1 1 2 [2]", " 1 8 10");
372  TEST_CS_EQUALS("-A-------C", " 0 0 1 1 1 1 1 1 1 1 [2]", " 1 9 10");
373  TEST_CS_EQUALS("-------A-C", " 0 0 0 0 0 0 0 0 1 1 [2]", " 7 9 10");
374  TEST_CS_EQUALS("--------AC", " 0 0 0 0 0 0 0 0 0 1 [2]", " 8 9 10");
375 
376  // 3 bases
377  TEST_CS_EQUALS("ACG-------", " 0 1 2 3 3 3 3 3 3 3 [3]", " 0 1 2 10");
378  TEST_CS_EQUALS("AC---G----", " 0 1 2 2 2 2 3 3 3 3 [3]", " 0 1 5 10");
379  TEST_CS_EQUALS("A-C--G----", " 0 1 1 2 2 2 3 3 3 3 [3]", " 0 2 5 10");
380  TEST_CS_EQUALS("A-C--G----", " 0 1 1 2 2 2 3 3 3 3 [3]", " 0 2 5 10");
381  TEST_CS_EQUALS("A--C-G----", " 0 1 1 1 2 2 3 3 3 3 [3]", " 0 3 5 10");
382  TEST_CS_EQUALS("A---CG----", " 0 1 1 1 1 2 3 3 3 3 [3]", " 0 4 5 10");
383 
384  TEST_CS_EQUALS("A---C---G-", " 0 1 1 1 1 2 2 2 2 3 [3]", " 0 4 8 10");
385  TEST_CS_EQUALS("A---C----G", " 0 1 1 1 1 2 2 2 2 2 [3]", " 0 4 9 10");
386  TEST_CS_EQUALS("A~~~C????G", " 0 1 1 1 1 2 2 2 2 2 [3]", " 0 4 9 10");
387 
388  // 4 bases
389  TEST_CS_EQUALS("-AC-G--T--", " 0 0 1 2 2 3 3 3 4 4 [4]", " 1 2 4 7 10");
390 
391  // 9 bases
392  TEST_CS_EQUALS("-CGTACGTAC", " 0 0 1 2 3 4 5 6 7 8 [9]", " 1 2 3 4 5 6 7 8 9 10");
393  TEST_CS_EQUALS("A-GTACGTAC", " 0 1 1 2 3 4 5 6 7 8 [9]", " 0 2 3 4 5 6 7 8 9 10");
394  TEST_CS_EQUALS("ACGTA-GTAC", " 0 1 2 3 4 5 5 6 7 8 [9]", " 0 1 2 3 4 6 7 8 9 10");
395  TEST_CS_EQUALS("ACGTACGT-C", " 0 1 2 3 4 5 6 7 8 8 [9]", " 0 1 2 3 4 5 6 7 9 10");
396  TEST_CS_EQUALS("ACGTACGTA-", " 0 1 2 3 4 5 6 7 8 9 [9]", " 0 1 2 3 4 5 6 7 8 10");
397 
398  // all 10 bases
399  TEST_CS_EQUALS("ACGTACGTAC", " 0 1 2 3 4 5 6 7 8 9 [10]", " 0 1 2 3 4 5 6 7 8 9 10");
400 
401  TEST_CS_EQUALS_OFFSET("A--C-G-", 0, " 0 1 1 1 2 2 3 [3]", " 0 3 5 7");
402  TEST_CS_EQUALS_OFFSET("A--C-G-", 2, " 0 0 0 1 1 1 2 2 3 [3]", " 2 5 7 9");
403  TEST_CS_EQUALS_OFFSET("A--C-G-", 3, " 0 0 0 0 1 1 1 2 2 3 [3]", " 3 6 8 10");
404  TEST_CS_EQUALS_OFFSET("A--C-G-", 4, " 0 0 0 0 0 1 1 1 2 2 3 [3]", " 4 7 9 11");
405 
406  // test no_of_gaps_before() and no_of_gaps_after()
407  TEST_GAPS_EQUALS_OFFSET("-AC---G", 0, " 1 0 3 0", " 0 3 0 [-1]");
408  TEST_GAPS_EQUALS_OFFSET(".AC..-G", 0, " 1 0 3 0", " 0 3 0 [-1]");
409  TEST_GAPS_EQUALS_OFFSET("~AC?\?-G", 0, " 1 0 3 0", " 0 3 0 [-1]");
410  TEST_GAPS_EQUALS_OFFSET("A--C-G-", 0, " 0 2 1 1", " 2 1 1 [-1]");
411  TEST_GAPS_EQUALS_OFFSET("A--C-G-", 1000, " 0 2 1 1", " 2 1 1 [-1]"); // is independent from offset
412 
413  // test dots
414  TEST_DOTS_EQUALS_OFFSET("----ACG--T--A-C--GT----", 0, " [-1]"); // no dots
415  TEST_DOTS_EQUALS_OFFSET("....ACG--T--A-C--GT....", 0, " [-1]"); // no extraordinary dots
416  TEST_DOTS_EQUALS_OFFSET("....ACG--T..A-C--GT....", 0, " 4 [-1]"); // i.e. "before base number 4"
417  TEST_DOTS_EQUALS_OFFSET("....ACG..T--A.C--GT....", 0, " 3 5 [-1]");
418  TEST_DOTS_EQUALS_OFFSET("....ACG..T~~A.C??GT....", 0, " 3 5 [-1]");
419 
420  TEST_DOTS_EQUALS_OFFSET("AC-----GTA-----CGT", 0, " [-1]");
421  // just care THAT dots occur in a gap, dont care how many
422  TEST_DOTS_EQUALS_OFFSET("A-.-G-...-C...T", 0, " 1 2 3 [-1]");
423  TEST_DOTS_EQUALS_OFFSET("A.--G...--C...T", 0, " 1 2 3 [-1]");
424  TEST_DOTS_EQUALS_OFFSET("A--.G--...C...T", 0, " 1 2 3 [-1]");
425 }
426 
427 #endif // UNIT_TESTS
428 
static CharPredicate pred_is_ali_gap(is_ali_gap)
int expdPosition(int cPos) const
Definition: seq_search.hxx:200
void restoreDots(CompactedSubSequence &slaveSequence)
Definition: seq_search.cxx:168
GB_warning(buffer)
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
int rel_2_abs(int rel) const
Definition: BI_basepos.hxx:87
CompactedSequence(const char *text, int length, const char *name, int start_offset=0)
Definition: seq_search.cxx:52
int firstDotPosition() const
Definition: seq_search.hxx:210
va_end(argp)
#define MESSAGE_BUFFERSIZE
Definition: seq_search.cxx:19
const char * name() const
Definition: seq_search.hxx:192
int length() const
Definition: seq_search.hxx:88
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
int no_of_gaps_before(int cPos) const
Definition: seq_search.hxx:110
long rightOf() const
Definition: seq_search.hxx:280
FILE * seq
Definition: rns.c:46
long leftOf() const
Definition: seq_search.hxx:279
AliDataPtr after(AliDataPtr data, size_t pos)
Definition: insdel.cxx:593
int abs_2_rel(int abs) const
Definition: BI_basepos.hxx:76
fflush(stdout)
int chars
Definition: seq_search.cxx:38
va_start(argp, format)
int no_of_gaps_after(int cPos) const
Definition: seq_search.hxx:197
__ATTR__FORMAT(1) static void messagef(const char *format
va_list argp
Definition: seq_search.cxx:33
void correctUnalignedPositions()
Definition: seq_search.cxx:124
bool is_gap(char c)
Definition: seq_search.hxx:48
int no_of_gaps_before(int cPos) const
Definition: seq_search.hxx:196
bool is_ali_gap(char c)
Definition: seq_search.hxx:47
AliDataPtr before(AliDataPtr data, size_t pos)
Definition: insdel.cxx:592
#define MAX_TRIPLES
Definition: seq_search.hxx:41
#define __ATTR__REDUCED_OPTIMIZE
Definition: test_unit.h:83
#define IF_ASSERTION_USED(x)
Definition: arb_assert.h:308
const char * text() const
Definition: seq_search.hxx:264
fa_assert(chars< MESSAGE_BUFFERSIZE)
FastSearchSequence(const CompactedSubSequence &seq)
Definition: seq_search.cxx:103
int first_base_abspos() const
Definition: BI_basepos.hxx:97
#define NULp
Definition: cxxforward.h:116
int last_base_abspos() const
Definition: BI_basepos.hxx:98
int nextDotPosition() const
Definition: seq_search.hxx:224
void storeDots(int beforePos)
Definition: seq_search.hxx:133
void append(Dots *neu)
Definition: seq_search.cxx:45
int compPosition(int xPos) const
Definition: seq_search.hxx:205
const char * quality() const
Definition: seq_search.hxx:351
void setDotsAtEOSequence()
Definition: seq_search.cxx:197