ARB
Location.cxx
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : Location.cxx //
4 // Purpose : //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in November 2006 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================ //
11 
12 #include "Location.h"
13 #include <arb_stdstr.h>
14 
15 #include <adGene.h>
16 #include <arbdb.h>
17 
18 #include <cctype>
19 #include <string>
20 
21 using namespace std;
22 
24 typedef vector<LocationPtr> LocationVector;
25 
27 
28 class SimpleLocation : public Location {
29  long pos1;
30  long pos2;
31  char uncertain1;
32  char uncertain2;
33 
34  static inline string one_pos_as_string(char uc, long p) {
35  string s;
36  if (uc != '=') s += uc;
37  return s += GBS_global_string("%li", p);
38  }
39 
40 public:
41  SimpleLocation(long p1, long p2, char u1, char u2)
42  : pos1(p1), pos2(p2), uncertain1(u1), uncertain2(u2)
43  {
44  arb_assert(u1 && u2);
45  }
46  SimpleLocation(long p, char u)
47  : pos1(p), pos2(p), uncertain1(u), uncertain2(u)
48  {
49  arb_assert(u == '='); // do not allow uncertainties with single position ctor
50  }
51 
52  int count() const OVERRIDE { return 1; }
54  bool isInRange(long p1, long p2) const OVERRIDE {
55  arb_assert(p1 <= p2);
56  return p1 <= pos1 && pos2 <= p2;
57  }
58  void save(GEN_position *pos, bool complementary) const OVERRIDE {
59  int p = pos->parts;
60 
61  pos->start_pos[p] = pos1;
62  pos->stop_pos[p] = pos2;
63  pos->complement[p] = char(complementary);
64  pos->start_uncertain[p] = uncertain1;
65  pos->stop_uncertain[p] = uncertain2;
66 
67  ++pos->parts;
68  }
69 
70  string as_string() const OVERRIDE {
71  if (uncertain1 == '+') {
72  if (uncertain2 != '-' || pos2 != (pos1+1)) throw "Invalid uncertainties";
73  return GBS_global_string("%li^%li", pos1, pos2);
74  }
75 
76  string s = one_pos_as_string(uncertain1, pos1);
77  if (pos1 != pos2) {
78  s += ".."+one_pos_as_string(uncertain2, pos2);
79  }
80  return s;
81  }
82 };
83 
84 class JoinedLocation : public Location {
85  LocationVector locations;
86  LocationJoinType joinType;
87 
88 public:
89  JoinedLocation(LocationJoinType jtype) : joinType(jtype) {}
90 
91  void push_back(const LocationPtr&loc) {
92  LocationJoinType loc_type = loc->getJoinType();
93  if (loc_type != LJT_NOT_JOINED && loc_type != joinType) {
94  throw "order() and join() cannot be mixed";
95  }
96  locations.push_back(loc);
97  }
98 
99  LocationJoinType getJoinType() const OVERRIDE { return joinType; }
100  int count() const OVERRIDE {
101  int Count = 0;
102 
103  LocationVectorCIter e = locations.end();
104  for (LocationVectorCIter i = locations.begin(); i != e; ++i) {
105  Count += (*i)->count();
106  }
107  return Count;
108  }
109 
110  bool isInRange(long p1, long p2) const OVERRIDE {
111  LocationVectorCIter e = locations.end();
112  for (LocationVectorCIter i = locations.begin(); i != e; ++i) {
113  if (!(*i)->isInRange(p1, p2)) {
114  return false;
115  }
116  }
117  return true;
118  }
119 
120  void save(GEN_position *pos, bool complementary) const OVERRIDE {
121  if (complementary) {
122  LocationVectorCRIter e = locations.rend();
123  for (LocationVectorCRIter i = locations.rbegin(); i != e; ++i) {
124  (*i)->save(pos, complementary);
125  }
126  }
127  else {
128  LocationVectorCIter e = locations.end();
129  for (LocationVectorCIter i = locations.begin(); i != e; ++i) {
130  (*i)->save(pos, complementary);
131  }
132  }
133  }
134  string as_string() const OVERRIDE {
135  string joined;
136  switch (joinType) {
137  case LJT_JOIN: joined = "join"; break;
138  case LJT_ORDER: joined = "order"; break;
139  default: throw "Unhandled join type"; break;
140  }
141  joined += '(';
142 
143  LocationVectorCIter e = locations.end();
144  LocationVectorCIter i = locations.begin();
145  if (i != e) {
146  joined += (*i++)->as_string();
147  for (; i != e; ++i) {
148  joined += ',';
149  joined += (*i)->as_string();
150  }
151  }
152  joined += ')';
153  return joined;
154  }
155 };
156 
157 class ComplementLocation : public Location {
158  LocationPtr location;
159 public:
160  ComplementLocation(const LocationPtr& loc) : location(loc) {}
161 
162  int count() const OVERRIDE { return location->count(); }
163  bool isInRange(long p1, long p2) const OVERRIDE { return location->isInRange(p1, p2); }
164  void save(GEN_position *pos, bool complementary) const OVERRIDE { location->save(pos, !complementary); }
165  LocationJoinType getJoinType() const OVERRIDE { return location->getJoinType(); }
166  string as_string() const OVERRIDE { return string("complement(")+location->as_string()+')'; }
167 };
168 
169 // --------------------------------------------------------------------------------
170 
171 static size_t parsePosition(const string& source, char& uncertain) {
172  // parses one position and returns the value
173  // if position contains uncertainties, they are stored in 'uncertain' (as '<' or '>')
174 
175  const char *s = source.c_str();
176  size_t slen = source.length();
177 
178  if (s[0] == '>' or s[0] == '<') {
179  uncertain = s[0];
180  s++;
181  slen--;
182  }
183  else {
184  uncertain = '=';
185  }
186 
187  char *end;
188  size_t pos = strtoul(s, &end, 10);
189 
190  size_t converted = end-s;
191  if (converted<slen) {
192  throw string("Unexpected char '")+end[0]+"' in '"+source+"'";
193  }
194 
195  return pos;
196 }
197 
198 static void parseLocationList(const string& source, size_t startPos, LocationVector& locvec) {
199  size_t comma = source.find_first_of("(,", startPos);
200 
201  while (comma != string::npos && source[comma] == '(') {
202  size_t pos = comma+1;
203  size_t paren_count = 1;
204 
205  while (paren_count>0) {
206  size_t paren = source.find_first_of("()", pos);
207  if (paren == string::npos) {
208  throw GBS_global_string("Expected %zu closing parenthesis in '%s'", paren_count, source.c_str());
209  }
210  if (source[paren] == ')') paren_count--;
211  else paren_count++;
212 
213  pos = paren+1;
214  }
215  comma = source.find_first_of("(,", pos);
216  }
217 
218  if (comma == string::npos) { // no comma on top level
219  locvec.push_back(parseLocation(source.substr(startPos)));
220  }
221  else {
222  arb_assert(source[comma] == ',');
223  locvec.push_back(parseLocation(source.substr(startPos, comma-startPos)));
224  parseLocationList(source, comma+1, locvec); // continue after comma
225  }
226 }
227 
228 static bool parseInfix(const string &str, const string& prefix, const string& postfix, string& foundInfix) {
229  bool parsed = false;
230  if (beginsWith(str, prefix) && endsWith(str, postfix)) {
231  size_t strlen = str.length();
232  size_t prelen = prefix.length();
233  size_t postlen = postfix.length();
234 
235  if (strlen >= (prelen+postlen)) { // otherwise str is to short (prefix and postfix overlap)
236  foundInfix = str.substr(prelen, strlen-(prelen+postlen));
237  parsed = true;
238  }
239  }
240  return parsed;
241 }
242 
243 LocationPtr parseLocation(const string& source) {
244  char first = source[0];
245  if (first == 'c') {
246  string infix;
247  if (parseInfix(source, "complement(", ")", infix)) {
248  return new ComplementLocation(parseLocation(infix));
249  }
250  }
251  else if (first == 'j' || first == 'o') {
252  string infix;
253  LocationJoinType joinType = LJT_UNDEF;
254 
255  if (parseInfix(source, "join(", ")", infix)) {
256  joinType = LJT_JOIN;
257  }
258  else if (parseInfix(source, "order(", ")", infix)) {
259  joinType = LJT_ORDER;
260  }
261 
262  if (joinType != LJT_UNDEF) {
263  LocationVector locvec;
264  parseLocationList(infix, 0, locvec);
265 
266  JoinedLocation *join = new JoinedLocation(joinType);
267  LocationPtr res = join;
268 
269  LocationVectorCIter e = locvec.end();
270  for (LocationVectorCIter i = locvec.begin(); i != e; ++i) {
271  join->push_back(*i);
272  }
273  return res;
274  }
275  }
276  else if (isdigit(first) || strchr("<>", first)) {
277  size_t dots = source.find("..");
278  if (dots != string::npos) {
279  char uncertain1, uncertain2;
280  size_t pos1 = parsePosition(source.substr(0, dots), uncertain1);
281  size_t pos2 = parsePosition(source.substr(dots+2), uncertain2);
282 
283  return new SimpleLocation(pos1, pos2, uncertain1, uncertain2);
284  }
285 
286  size_t in_between = source.find("^");
287  if (in_between != string::npos) {
288  char uncertain1, uncertain2;
289  size_t pos1 = parsePosition(source.substr(0, in_between), uncertain1);
290  size_t pos2 = parsePosition(source.substr(in_between+1), uncertain2);
291 
292  if (uncertain1 == '=' && uncertain2 == '=' && pos2 == pos1+1) {
293  return new SimpleLocation(pos1, pos2, '+', '-');
294  }
295  throw string("Can only handle 'pos^pos+1'. Can't parse location '"+source+"'");
296  }
297  else {
298  // single base position
299  char uncertain;
300  size_t single_pos = parsePosition(source, uncertain);
301  if (uncertain != '=') throw "No uncertainty allowed for single positions";
302  return new SimpleLocation(single_pos, uncertain);
303  }
304  }
305 
306  throw string("Unparsable location '"+source+"'");
307 }
308 
310  GEN_position *pos = GEN_new_position(count(), getJoinType() == LJT_JOIN);
312 
313 #if defined(ASSERTION_USED)
314  int org_parts = pos->parts;
315 #endif // ASSERTION_USED
316 
317  pos->parts = 0; // misuse 'parts' as index for filling 'pos'
318  save(pos, false);
319 
320  arb_assert(pos->parts == org_parts);
321 
322  return pos;
323 }
324 
325 inline LocationPtr part2SimpleLocation(const GEN_position *pos, int i, bool inverseComplement) {
326  LocationPtr res = new SimpleLocation(pos->start_pos[i], pos->stop_pos[i], pos->start_uncertain[i], pos->stop_uncertain[i]);
327  if (contradicted(pos->complement[i], inverseComplement)) {;
328  res = new ComplementLocation(res);
329  }
330  return res;
331 }
332 
336 
337  if (gp->parts == 1) {
338  return part2SimpleLocation(gp, 0, false);
339  }
340 
341  int complemented = 0;
342  for (int p = 0; p<gp->parts; ++p) complemented += gp->complement[p]; // LOOP_VECTORIZED
343 
345  LocationPtr res = joined;
346 
347  if (2*complemented > gp->parts) {
348  // create "complement(join())" since it is "smaller"
349  for (int p = gp->parts-1; p >= 0; --p) {
350  LocationPtr ploc = part2SimpleLocation(gp, p, true);
351  joined->push_back(ploc);
352  }
353  res = new ComplementLocation(res);
354  }
355  else {
356  for (int p = 0; p<gp->parts; ++p) {
357  LocationPtr ploc = part2SimpleLocation(gp, p, false);
358  joined->push_back(ploc);
359  }
360  }
361  return res;
362 }
363 
364 // --------------------------------------------------------------------------------
365 
366 #ifdef UNIT_TESTS
367 #include <test_unit.h>
368 
369 static GB_ERROR perform_conversions(const string& in, string& out, string& out2) {
370  GB_ERROR error = NULp;
371  try {
372  LocationPtr loc = parseLocation(in);
373  out = loc->as_string();
374 
375  GEN_position *gp = loc->create_GEN_position();
376  TEST_REJECT_NULL(gp);
377 
378  LocationPtr reloc = to_Location(gp);
379  out2 = reloc->as_string();
380 
381  GEN_free_position(gp);
382  }
383  catch (std::string& err) { error = GBS_static_string(err.c_str()); }
384  catch (const char *&err) { error = GBS_static_string(err); }
385  return error;
386 }
387 
388 #define DO_LOCONV_NOERR(str) string reverse, convReverse; TEST_EXPECT_NO_ERROR(perform_conversions(str, reverse, convReverse));
389 
390 // the following assertions test
391 // - conversion from string->Location->string and
392 // - conversion from string->Location->GEN_position->Location->string
393 //
394 // TEST_EXPECT___CONV_IDENT expects both conversions equal input
395 // TEST_EXPECT___CONV__INTO expects 1st conversion changes to 'res'
396 // TEST_EXPECT_RECONV__INTO expects 2nd conversion changes to 'res'
397 // TEST_EXPECT_RECONV_INTO2 expects 1st conversion changes to 'res1' and 2nd to 'res2'
398 
399 #define TEST_EXPECT___CONV_IDENT(str) do { \
400  DO_LOCONV_NOERR(str); \
401  TEST_EXPECT_EQUAL(str, reverse.c_str()); \
402  TEST_EXPECT_EQUAL(str, convReverse.c_str()); \
403  } while(0)
404 
405 #define TEST_EXPECT___CONV_IDENT__BROKEN(str) do { \
406  DO_LOCONV_NOERR(str); \
407  TEST_EXPECT_EQUAL__BROKEN(str, reverse.c_str()); \
408  } while(0)
409 
410 #define TEST_EXPECT___CONV__INTO(str,res) do { \
411  DO_LOCONV_NOERR(str); \
412  TEST_EXPECT_EQUAL(res, reverse.c_str()); \
413  TEST_EXPECT_EQUAL(res, convReverse.c_str()); \
414  } while(0)
415 
416 #define TEST_EXPECT_RECONV__INTO(str,res) do { \
417  do { \
418  DO_LOCONV_NOERR(str); \
419  TEST_EXPECT_EQUAL(str, reverse.c_str()); \
420  TEST_EXPECT_EQUAL(res, convReverse.c_str()); \
421  TEST_EXPECT_MORE_EQUAL(reverse.length(), convReverse.length()); \
422  } while(0); \
423  TEST_EXPECT___CONV_IDENT(res); \
424  } while(0)
425 
426 #define TEST_EXPECT_RECONV_INTO2(str,res1,res2) do { \
427  DO_LOCONV_NOERR(str); \
428  TEST_EXPECT_EQUAL(res1, reverse.c_str()); \
429  TEST_EXPECT_EQUAL(res2, convReverse.c_str()); \
430  TEST_EXPECT_MORE_EQUAL(reverse.length(), convReverse.length()); \
431  } while(0)
432 
433 #define TEST_EXPECT__PARSE_ERROR(str,err) do { \
434  string reverse, convReverse; \
435  TEST_EXPECT_EQUAL(perform_conversions(str, reverse, convReverse), err); \
436  } while(0)
437 
438 __ATTR__REDUCED_OPTIMIZE void TEST_gene_location() {
439  // see also ../ARBDB/adGene.cxx@TEST_GEN_position
440 
441  TEST_EXPECT___CONV_IDENT("1725");
442  TEST_EXPECT__PARSE_ERROR("3-77", "Unexpected char '-' in '3-77'");
443  TEST_EXPECT___CONV_IDENT("3..77");
444  TEST_EXPECT___CONV_IDENT("77..3"); // @@@ could be interpreted as reverse (but not complement)
445 
446  TEST_EXPECT___CONV_IDENT("<3..77");
447  TEST_EXPECT___CONV_IDENT("3..>77");
448  TEST_EXPECT___CONV_IDENT(">3..<77");
449 
450  TEST_EXPECT___CONV_IDENT("7^8");
451  TEST_EXPECT__PARSE_ERROR("7^9", "Can only handle 'pos^pos+1'. Can't parse location '7^9'");
452 
453  TEST_EXPECT___CONV_IDENT("complement(3..77)");
454  TEST_EXPECT___CONV_IDENT("complement(77..3)");
455  TEST_EXPECT___CONV_IDENT("complement(77)");
456 
457  TEST_EXPECT___CONV_IDENT("join(3..77,100..200)");
458  TEST_EXPECT_RECONV__INTO("join(3..77)", "3..77");
459  TEST_EXPECT___CONV_IDENT("join(3..77,100,130..177)");
460  TEST_EXPECT__PARSE_ERROR("join(3..77,100..200, 130..177)", "Unparsable location ' 130..177'");
461 
462  TEST_EXPECT_RECONV__INTO("order(1)", "1");
463  TEST_EXPECT___CONV_IDENT("order(0,8,15)");
464  TEST_EXPECT___CONV_IDENT("order(10..12,7..9,1^2)");
465  TEST_EXPECT_RECONV__INTO("order(complement(1^2),complement(7..9),complement(10..12))",
466  "complement(order(10..12,7..9,1^2))");
467  TEST_EXPECT___CONV_IDENT("order(10..12,complement(7..9),1^2)");
468  TEST_EXPECT_RECONV__INTO("order(complement(1^2),7..9,complement(10..12))",
469  "complement(order(10..12,complement(7..9),1^2))");
470 
471  TEST_EXPECT__PARSE_ERROR("join(order(0,8,15),order(3,2,1))", "order() and join() cannot be mixed");
472 
473  TEST_EXPECT_RECONV__INTO("join(complement(3..77),complement(100..200))",
474  "complement(join(100..200,3..77))");
475  TEST_EXPECT_RECONV__INTO("join(complement(join(3..77,74..83)),complement(100..200))",
476  "complement(join(100..200,3..77,74..83))");
477  TEST_EXPECT_RECONV__INTO("complement(complement(complement(100..200)))",
478  "complement(100..200)");
479  TEST_EXPECT_RECONV__INTO("join(complement(join(complement(join(1)))))",
480  "1");
481 
482  // cover errors
483  TEST_EXPECT__PARSE_ERROR("join(abc()", "Expected 1 closing parenthesis in 'abc('");
484 
485  // strange behavior
486  TEST_EXPECT___CONV__INTO("", "0");
487  TEST_EXPECT_RECONV_INTO2("join()", "join(0)", "0");
488  TEST_EXPECT___CONV__INTO("complement()", "complement(0)");
489  TEST_EXPECT_RECONV_INTO2("complement(complement())", "complement(complement(0))", "0");
490 }
491 
492 #endif // UNIT_TESTS
#define arb_assert(cond)
Definition: arb_assert.h:245
static bool parseInfix(const string &str, const string &prefix, const string &postfix, string &foundInfix)
Definition: Location.cxx:228
void push_back(const LocationPtr &loc)
Definition: Location.cxx:91
SimpleLocation(long p, char u)
Definition: Location.cxx:46
unsigned char * complement
Definition: adGene.h:41
LocationJoinType getJoinType() const OVERRIDE
Definition: Location.cxx:99
vector< LocationPtr > LocationVector
Definition: Location.cxx:24
string as_string() const OVERRIDE
Definition: Location.cxx:166
return string(buffer, length)
int count() const OVERRIDE
Definition: Location.cxx:162
void GEN_free_position(GEN_position *pos)
Definition: adGene.cxx:195
virtual bool isInRange(long pos1, long pos2) const =0
int count() const OVERRIDE
Definition: Location.cxx:52
int count() const OVERRIDE
Definition: Location.cxx:100
static int pos1
Definition: ClustalV.cxx:58
int parts
Definition: adGene.h:37
string as_string() const OVERRIDE
Definition: Location.cxx:134
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
STL namespace.
LocationPtr parseLocation(const string &source)
Definition: Location.cxx:243
void GEN_use_uncertainties(GEN_position *pos)
Definition: adGene.cxx:184
LocationPtr part2SimpleLocation(const GEN_position *pos, int i, bool inverseComplement)
Definition: Location.cxx:325
DEFINE_ITERATORS(LocationVector)
static void parseLocationList(const string &source, size_t startPos, LocationVector &locvec)
Definition: Location.cxx:198
void save(GEN_position *pos, bool complementary) const OVERRIDE
Definition: Location.cxx:164
void save(GEN_position *pos, bool complementary) const OVERRIDE
Definition: Location.cxx:58
SmartPtr< Location > LocationPtr
Definition: Location.cxx:23
unsigned char * start_uncertain
Definition: adGene.h:53
virtual void save(GEN_position *into, bool complementary) const =0
virtual std::string as_string() const =0
void save(GEN_position *pos, bool complementary) const OVERRIDE
Definition: Location.cxx:120
LocationPtr to_Location(const GEN_position *gp)
Definition: Location.cxx:333
size_t * stop_pos
Definition: adGene.h:40
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1325
static void error(const char *msg)
Definition: mkptypes.cxx:96
bool isInRange(long p1, long p2) const OVERRIDE
Definition: Location.cxx:54
ComplementLocation(const LocationPtr &loc)
Definition: Location.cxx:160
LocationJoinType getJoinType() const OVERRIDE
Definition: Location.cxx:165
GEN_position * create_GEN_position() const
Definition: Location.cxx:309
string as_string() const OVERRIDE
Definition: Location.cxx:70
static int pos2
Definition: ClustalV.cxx:59
bool beginsWith(const std::string &str, const std::string &start)
Definition: arb_stdstr.h:22
#define __ATTR__REDUCED_OPTIMIZE
Definition: test_unit.h:83
GEN_position * GEN_new_position(int parts, bool joinable)
Definition: adGene.cxx:155
bool endsWith(const std::string &str, const std::string &postfix)
Definition: arb_stdstr.h:26
JoinedLocation(LocationJoinType jtype)
Definition: Location.cxx:89
LocationJoinType getJoinType() const OVERRIDE
Definition: Location.cxx:53
static size_t parsePosition(const string &source, char &uncertain)
Definition: Location.cxx:171
#define OVERRIDE
Definition: cxxforward.h:112
virtual int count() const =0
const char * GBS_static_string(const char *str)
Definition: arb_msg.cxx:212
size_t * start_pos
Definition: adGene.h:39
LocationJoinType
Definition: Location.h:35
#define NULp
Definition: cxxforward.h:116
virtual LocationJoinType getJoinType() const =0
bool joinable
Definition: adGene.h:38
bool isInRange(long p1, long p2) const OVERRIDE
Definition: Location.cxx:163
bool isInRange(long p1, long p2) const OVERRIDE
Definition: Location.cxx:110
unsigned char * stop_uncertain
Definition: adGene.h:54
SimpleLocation(long p1, long p2, char u1, char u2)
Definition: Location.cxx:41
GB_write_int const char s
Definition: AW_awar.cxx:154