ARB
RegExpr.cxx
Go to the documentation of this file.
1 // ============================================================= //
2 // //
3 // File : RegExpr.cxx //
4 // Purpose : Wrapper for ARBDB regular expressions //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in April 2009 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ============================================================= //
11 
12 #include "RegExpr.hxx"
13 
14 #include <arb_match.h>
15 #include <arb_mem.h>
16 
17 #include <regex.h>
18 
19 using namespace std;
20 
// Local view onto the opaque regex handle used by the GBS_*_regexpr functions.
// definition exists twice (see ../../ARBDB/admatch.c)
struct GBS_regex {
    regex_t compiled;
};
22 
23 RegExpr::RegExpr(const std::string& expression_, bool ignore_case_) :
24  expression(expression_),
25  ignore_case(ignore_case_),
26  comreg(NULp),
27  matches(NULp),
28  failure(NULp)
29 {}
30 
32  if (comreg) GBS_free_regexpr(comreg);
33  if (failure) delete failure;
34  delete [] matches;
35 }
36 
37 bool RegExpr::compile() const {
38  if (!comreg && !failure) {
39  delete [] matches; matches = NULp;
40 
42  comreg = GBS_compile_regexpr(expression.c_str(), ignore_case ? GB_IGNORE_CASE : GB_MIND_CASE, &error);
43  if (error) {
44  failure = new string(error);
45  }
46  re_assert(contradicted(comreg, failure));
47  }
48  return comreg;
49 }
50 
51 void RegExpr::perform_match(const char *str, size_t offset) const {
52  /* Searches for first match (and submatches) in 'str'
53  *
54  * sets member 'matches' to array of match + subexpression matches (heap-copy)
55  * or to NULp if nothing matched
56  *
57  * If 'offset' > 0, then str is searched from position 'offset'.
58  * In this case it is assumed, that we are not at line start!
59  */
60 
61  delete [] matches; matches = NULp;
62 
63  size_t subs = subexpr_count();
64  regmatch_t *possMatch = ARB_alloc<regmatch_t>(subs+1);
65  int eflags = offset ? REG_NOTBOL : 0;
66  int res = regexec(&comreg->compiled, str+offset, subs+1, possMatch, eflags);
67 
68  if (res != REG_NOMATCH) {
69  matches = new RegMatch[subs+1];
70  for (size_t s = 0; s <= subs; s++) {
71  if (possMatch[s].rm_so != -1) { // real match
72  matches[s] = RegMatch(possMatch[s].rm_so+offset, possMatch[s].rm_eo+offset);
73  }
74  }
75  re_assert(matches[0].didMatch()); // complete match has to be found
76  }
77  free(possMatch);
78 }
79 
80 const RegMatch *RegExpr::match(const std::string& versus, size_t offset) const {
81  if (!comreg) {
82  // lazy compilation
83  if (!compile()) return NULp;
84  }
85  perform_match(versus.c_str(), offset);
86  return (matches && matches[0].didMatch()) ? &matches[0] : NULp;
87 }
88 
89 size_t RegExpr::subexpr_count() const {
90  if (!comreg) {
91  // lazy compilation
92  if (!compile()) return 0;
93  }
94  return comreg->compiled.re_nsub;
95 }
96 
97 const RegMatch *RegExpr::subexpr_match(size_t subnr) const {
98  // get subexpression match from last 'match()'
99  // (or NULp if subexpression 'subnr' did not match)
100  //
101  // 'subnr' is in range [1..subexpr_count()]
102 
103  const RegMatch *result = NULp;
104  if (matches) {
105  size_t subs = subexpr_count();
106  re_assert(subnr >= 1 && subnr <= subs); // illegal subexpression index
107  if (subnr >= 1 && subnr <= subs) {
108  if (matches[subnr].didMatch()) result = &matches[subnr];
109  }
110  }
111  return result;
112 }
113 
114 // --------------------------------------------------------------------------------
115 
116 #ifdef UNIT_TESTS
117 #ifndef TEST_UNIT_H
118 #include <test_unit.h>
119 #endif
120 
121 static arb_test::match_expectation got_a_match(const RegExpr& exp, const RegMatch *match) {
122  using namespace arb_test;
123  const string *expression_failed = exp.has_failed();
124 
125  expectation_group expected(that(match).does_differ_from_NULL());
126  expected.add(that(expression_failed).is_equal_to_NULL());
127  if (expression_failed) {
128  const char *expression_failed_msg = expression_failed->c_str();
129  expected.add(that(expression_failed_msg).is_equal_to("(unwanted)"));
130  }
131  return all().ofgroup(expected);
132 }
133 static arb_test::match_expectation got_no_match(const RegExpr& exp, const RegMatch *match) {
134  using namespace arb_test;
135  const string *expression_failed = exp.has_failed();
136 
137  expectation_group expected(that(match).is_equal_to_NULL());
138  expected.add(that(expression_failed).is_equal_to_NULL());
139  if (expression_failed) {
140  const char *expression_failed_msg = expression_failed->c_str();
141  expected.add(that(expression_failed_msg).is_equal_to("(unwanted)"));
142  }
143  return all().ofgroup(expected);
144 }
145 static arb_test::match_expectation fails_to_compile(const RegExpr& exp,const char *error) {
146  using namespace arb_test;
147  const string *expression_failed = exp.has_failed();
148 
149  expectation_group expected(that(expression_failed).does_differ_from_NULL());
150  if (expression_failed) {
151  const char *expression_failed_msg = expression_failed->c_str();
152  expected.add(that(expression_failed_msg).is_equal_to(error));
153  }
154  return all().ofgroup(expected);
155 }
156 
// Expects that 'regexpr' (case-sensitive) does not compile
// and that the reported failure message equals 'error'.
#define TEST_REGEX_FAILS_TO_COMPILE(regexpr,error) do{                  \
        RegExpr exp(regexpr, false);                                    \
        TEST_EXPECTATION(fails_to_compile(exp,error));                  \
    }while(0)
161 
// Expects that 'regexpr' matches 'str' and that the matched
// part of 'str' equals 'exp_match'.
#define TEST_REGEX_MATCHES(str,regexpr,igCase,exp_match) do {           \
        RegExpr exp(regexpr, igCase);                                   \
        const RegMatch *match = exp.match(str);                         \
        TEST_EXPECTATION(got_a_match(exp,match));                       \
        TEST_EXPECT_EQUAL(match->extract(str).c_str(), exp_match);      \
    } while(0)
168 
// Expects that 'regexpr' compiles, but does not match 'str'.
#define TEST_REGEX_DOESNT_MATCH(str,regexpr,igCase) do {                \
        RegExpr exp(regexpr, igCase);                                   \
        const RegMatch *match = exp.match(str);                         \
        TEST_EXPECTATION(got_no_match(exp,match));                      \
    } while(0)
174 
// Expects that 'regexpr' matches 'str' with 'exp_match' as complete match
// and 'exp_sub1match' as match of the first subexpression.
#define TEST_REGEX_MATCHES_SUB1(str,regexpr,igCase,exp_match,exp_sub1match) do { \
        RegExpr exp(regexpr, igCase);                                   \
        const RegMatch *match = exp.match(str);                         \
        TEST_EXPECTATION(got_a_match(exp,match));                       \
        TEST_EXPECT_EQUAL(match->extract(str).c_str(), exp_match);      \
        match = exp.subexpr_match(1);                                   \
        TEST_REJECT_NULL(match);                                        \
        TEST_EXPECT_EQUAL(match->extract(str).c_str(), exp_sub1match);  \
    } while(0)
184 
185 void TEST_regexpr() {
186  TEST_REGEX_MATCHES("bla", "^bla$", false, "bla");
187 
188  TEST_REGEX_DOESNT_MATCH("3;1406", "^bla$", true);
189 
190  // disable the following section at will (arb does not depend on it)
191 #if defined(DARWIN)
192  TEST_REGEX_FAILS_TO_COMPILE("*", "repetition-operator operand invalid");
193  TEST_REGEX_FAILS_TO_COMPILE("[[:fantasy:]]*", "invalid character class");
194  TEST_REGEX_FAILS_TO_COMPILE("x{2", "braces not balanced");
195  TEST_REGEX_FAILS_TO_COMPILE("x{2-5}", "invalid repetition count(s)");
196  TEST_REGEX_FAILS_TO_COMPILE("x{5,2}", "invalid repetition count(s)");
197 #else // !DARWIN
198  TEST_REGEX_FAILS_TO_COMPILE("*", "Invalid preceding regular expression");
199  TEST_REGEX_FAILS_TO_COMPILE("[[:fantasy:]]*", "Invalid character class name");
200  TEST_REGEX_FAILS_TO_COMPILE("x{2", "Unmatched \\{");
201  TEST_REGEX_FAILS_TO_COMPILE("x{2-5}", "Invalid content of \\{\\}");
202  TEST_REGEX_FAILS_TO_COMPILE("x{5,2}", "Invalid content of \\{\\}");
203 #endif
204 
205  // RESULT_MODIFIED_OSX: regex isn't portable :/
206 #if defined(DARWIN)
207  TEST_REGEX_FAILS_TO_COMPILE("^bla|", "empty (sub)expression");
208 #else // !DARWIN
209  TEST_REGEX_MATCHES("3;1406", "^bla|", true, "");
210 #endif
211 
212  TEST_REGEX_MATCHES_SUB1("3;1406", "^[0-9]+;([0-9]+)$", true, "3;1406", "1406");
213  TEST_REGEX_MATCHES_SUB1(" Find CAPITAL \t WoRD ", "[[:space:]]+([A-Z]+)[[:space:]]+", false, " CAPITAL \t ", "CAPITAL");
214  TEST_REGEX_MATCHES_SUB1("--ajsd--aksjdh----alsdjkasldja---", "-+([a-z]{5,8})-+", true, "--aksjdh----", "aksjdh");
215 }
216 TEST_PUBLISH(TEST_regexpr);
217 
218 #endif // UNIT_TESTS
219 
220 // --------------------------------------------------------------------------------
const RegMatch * subexpr_match(size_t subnr) const
Definition: RegExpr.cxx:97
string result
~RegExpr()
Definition: RegExpr.cxx:31
group_matcher all()
Definition: test_unit.h:1011
GBS_regex * GBS_compile_regexpr(const char *regexpr, GB_CASE case_flag, GB_ERROR *error)
Definition: arb_match.cxx:40
return string(buffer, length)
STL namespace.
RegExpr(const std::string &expression_, bool ignore_case)
Definition: RegExpr.cxx:23
#define TEST_PUBLISH(testfunction)
Definition: test_unit.h:1517
#define is_equal_to_NULL()
Definition: test_unit.h:1028
static void error(const char *msg)
Definition: mkptypes.cxx:96
const RegMatch * match(const std::string &versus, size_t offset=0) const
Definition: RegExpr.cxx:80
#define that(thing)
Definition: test_unit.h:1043
#define does_differ_from_NULL()
Definition: test_unit.h:1029
#define is_equal_to(val)
Definition: test_unit.h:1025
#define re_assert(cond)
Definition: refentries.h:38
const std::string * has_failed() const
Definition: RegExpr.hxx:78
void GBS_free_regexpr(GBS_regex *toFree)
Definition: arb_match.cxx:62
#define NULp
Definition: cxxforward.h:116
regex_t compiled
Definition: arb_match.cxx:26
#define offset(field)
Definition: GLwDrawA.c:73
size_t subexpr_count() const
Definition: RegExpr.cxx:89
GB_write_int const char s
Definition: AW_awar.cxx:154