ARB
arb_match.cxx
Go to the documentation of this file.
1 // ================================================================= //
2 // //
3 // File : arb_match.cxx //
4 // Purpose : POSIX ERE //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in September 2013 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================= //
11 
12 // AISC_MKPT_PROMOTE:#ifndef ARB_CORE_H
13 // AISC_MKPT_PROMOTE:#include "arb_core.h"
14 // AISC_MKPT_PROMOTE:#endif
15 
16 #include "arb_match.h"
17 #include "arb_msg.h"
18 #include "arb_string.h"
19 #include "arb_strbuf.h"
20 
21 #include <regex.h>
22 
23 // ---------------------------------------------
24 // Regular Expressions search/replace
25 
// Wrapper around a compiled POSIX regular expression.
// Note: definition exists twice (see ../SL/REGEXPR/RegExpr.cxx)
struct GBS_regex {
    regex_t compiled; // filled by regcomp(), released by regfree()
};
27 
28 inline char *give_buffer(size_t size) {
29  static char *buf = NULp;
30  static size_t bufsize = 0;
31 
32  if (size<1) size = 1;
33  if (bufsize<size) {
34  bufsize = size;
35  freeset(buf, ARB_alloc<char>(bufsize));
36  }
37  return buf;
38 }
39 
40 GBS_regex *GBS_compile_regexpr(const char *regexpr, GB_CASE case_flag, GB_ERROR *error) {
41  GBS_regex *comreg = ARB_alloc<GBS_regex>(1);
42  int cflags = REG_EXTENDED|(case_flag == GB_IGNORE_CASE ? REG_ICASE : 0)|REG_NEWLINE;
43  int errcode = regcomp(&comreg->compiled, regexpr, cflags);
44 
45  if (errcode != 0) { // error compiling regexpr
46  size_t size = regerror(errcode, &comreg->compiled, NULp, 0);
47  char *buf = give_buffer(size);
48 
49  regerror(errcode, &comreg->compiled, buf, size);
50  *error = buf;
51 
52  free(comreg);
53  comreg = NULp;
54  }
55  else {
56  *error = NULp;
57  }
58 
59  return comreg;
60 }
61 
62 void GBS_free_regexpr(GBS_regex *toFree) {
63  if (toFree) {
64  regfree(&toFree->compiled);
65  free(toFree);
66  }
67 }
68 
69 const char *GBS_unwrap_regexpr(const char *regexpr_in_slashes, GB_CASE *case_flag, GB_ERROR *error) {
70  /* unwraps 'expr' from '/expr/[i]'
71  * if slashes are not present, 'error' is set
72  * 'case_flag' is set to GB_MIND_CASE if format is '/expr/' or
73  * to GB_IGNORE_CASE if format is '/expr/i'
74  *
75  * returns a pointer to a static buffer (containing the unwrapped expression)
76  * (Note: The content is invalidated by the next call to GBS_unwrap_regexpr)
77  */
78 
79  arb_assert(error && !*error);
80 
81  const char *result = NULp;
82  const char *end = strchr(regexpr_in_slashes, 0);
83 
84  if (end >= (regexpr_in_slashes+3)) {
85  *case_flag = GB_MIND_CASE;
86  if (end[-1] == 'i') {
87  *case_flag = GB_IGNORE_CASE;
88  end--;
89  }
90  if (regexpr_in_slashes[0] == '/' && end[-1] == '/') {
91  arb_assert(!*error);
92 
93  static char *result_buffer = NULp;
94  static size_t max_len = 0;
95 
96  size_t len = end-regexpr_in_slashes-2;
97  arb_assert(len>0); // don't accept empty expression
98 
99  if (len>max_len) {
100  max_len = len*3/2;
101  freeset(result_buffer, ARB_alloc<char>(max_len+1));
102  }
103 
104  memcpy(result_buffer, regexpr_in_slashes+1, len);
105  result_buffer[len] = 0;
106 
107  result = result_buffer;
108  }
109  }
110 
111  if (!result) {
112  *error = GBS_global_string("Regular expression format is '/expr/' or '/expr/i', not '%s'",
113  regexpr_in_slashes);
114  }
115 
116  arb_assert(contradicted(result, *error));
117  return result;
118 }
119 
120 const char *GBS_regmatch_compiled(const char *str, GBS_regex *comreg, size_t *matchlen) {
121  /* like GBS_regmatch,
122  * - but uses a precompiled regular expression
123  * - no errors can occur here (beside out of memory, which is not handled)
124  */
125 
126  regmatch_t match;
127  int res = regexec(&comreg->compiled, str, 1, &match, 0);
128  const char *matchpos = NULp;
129 
130  if (res == 0) { // matched
131  matchpos = str+match.rm_so;
132  if (matchlen) *matchlen = match.rm_eo-match.rm_so;
133  }
134 
135  return matchpos;
136 }
137 
138 const char *GBS_regmatch(const char *str, const char *regExpr, size_t *matchlen, GB_ERROR *error) {
139  /* searches 'str' for first occurrence of 'regExpr'
140  * 'regExpr' has to be in format "/expr/[i]", where 'expr' is a POSIX extended regular expression
141  *
142  * for regexpression format see http://help.arb-home.de/reg.html#Syntax_of_POSIX_extended_regular_expressions_as_used_in_ARB
143  *
144  * returns
145  * - pointer to start of first match in 'str' and
146  * length of match in 'matchlen' ('matchlen' may be NULp, then no len is reported)
147  * or
148  * - NULp if nothing matched (in this case 'matchlen' is undefined)
149  *
150  * 'error' will be set if sth is wrong
151  *
152  * Note: Only use this function if you do exactly ONE match.
153  * Use GBS_regmatch_compiled if you use the regexpr twice or more!
154  */
155  arb_assert(error && !*error);
156 
157  const char *firstMatch = NULp;
158  GB_CASE case_flag;
159  const char *unwrapped_expr = GBS_unwrap_regexpr(regExpr, &case_flag, error);
160 
161  if (unwrapped_expr) {
162  GBS_regex *comreg = GBS_compile_regexpr(unwrapped_expr, case_flag, error);
163  if (comreg) {
164  firstMatch = GBS_regmatch_compiled(str, comreg, matchlen);
165  GBS_free_regexpr(comreg);
166  }
167  }
168 
169  arb_assert(implicated(firstMatch, !*error));
170  arb_assert(implicated(*error, !firstMatch));
171 
172  return firstMatch;
173 }
174 
175 char *GBS_regreplace(const char *str, const char *regReplExpr, GB_ERROR *error) {
176  /* search and replace all matches in 'str' using POSIX extended regular expression
177  * 'regReplExpr' has to be in format '/regexpr/replace/[i]'
178  *
179  * returns
180  * - a heap copy of the modified string or
181  * - NULp if something went wrong (in this case 'error' contains the reason)
182  *
183  * 'replace' may contain several special substrings:
184  *
185  * "\n" gets replaced by '\n'
186  * "\t" -------''------- '\t'
187  * "\\" -------''------- '\\'
188  * "\0" -------''------- the complete match to regexpr
189  * "\1" -------''------- the match to the first subexpression
190  * "\2" -------''------- the match to the second subexpression
191  * ...
192  * "\9" -------''------- the match to the ninth subexpression
193  */
194 
195  // test performed via ACI in ../ARBDB/gb_aci.cxx@GBS_REGREPLACE_TESTS
196 
197  arb_assert(error && !*error);
198 
199  GB_CASE case_flag;
200  const char *unwrapped_expr = GBS_unwrap_regexpr(regReplExpr, &case_flag, error);
201  char *result = NULp;
202 
203  if (unwrapped_expr) {
204  const char *sep = unwrapped_expr;
205  while (sep) {
206  sep = strchr(sep, '/');
207  if (!sep) break;
208  if (sep>unwrapped_expr && sep[-1] != '\\') break;
209  ++sep;
210  }
211 
212  if (!sep) {
213  // Warning: GB_command_interpreter() tests for this error message - don't change
214  *error = "Missing '/' between search and replace string";
215  }
216  else {
217  char *regexpr = ARB_strpartdup(unwrapped_expr, sep-1);
218  char *replexpr = ARB_strpartdup(sep+1, NULp);
219  GBS_regex *comreg = GBS_compile_regexpr(regexpr, case_flag, error);
220 
221  const bool exprMatchesBorderOfLine = regexpr[0] == '^' || sep[-1] == '$';
222 
223  if (comreg) {
224  GBS_strstruct out(1000);
225  int eflags = 0;
226 
227  while (str) {
228  regmatch_t match[10];
229  int res = regexec(&comreg->compiled, str, 10, match, eflags);
230 
231  if (res == REG_NOMATCH) { // did not match
232  out.cat(str); // copy original
233  str = NULp;
234  }
235  else if (match[0].rm_so == match[0].rm_eo && !exprMatchesBorderOfLine) { // found empty match
236  *error = GBS_global_string("The regular expression '%s' matched an empty string (not allowed for replace)", regexpr);
237  str = NULp;
238  }
239  else { // found non-empty match
240  size_t p;
241  char c;
242 
243  out.ncat(str, match[0].rm_so);
244 
245  for (p = 0; (c = replexpr[p]); ++p) {
246  if (c == '\\') {
247  c = replexpr[++p];
248  if (!c) break;
249  if (c >= '0' && c <= '9') {
250  regoff_t start = match[c-'0'].rm_so;
251  out.ncat(str+start, match[c-'0'].rm_eo-start);
252  }
253  else {
254  switch (c) {
255  case 'n': c = '\n'; break;
256  case 't': c = '\t'; break;
257  default: break;
258  }
259  out.put(c);
260  }
261  }
262  else {
263  out.put(c);
264  }
265  }
266 
267  str = str+match[0].rm_eo; // continue behind match
268  eflags = REG_NOTBOL|REG_NOTEOL; // for futher matches, do not regard 'str' as "beginning/end of line"
269  }
270  }
271 
272  GBS_free_regexpr(comreg);
273  if (!*error) {
274  result = out.release();
275  }
276  }
277  free(replexpr);
278  free(regexpr);
279  }
280  }
281 
282  arb_assert(contradicted(result, *error));
283 
284  return result;
285 }
286 
#define arb_assert(cond)
Definition: arb_assert.h:245
const char * GB_ERROR
Definition: arb_core.h:25
string result
#define implicated(hypothesis, conclusion)
Definition: arb_assert.h:289
GBS_regex * GBS_compile_regexpr(const char *regexpr, GB_CASE case_flag, GB_ERROR *error)
Definition: arb_match.cxx:40
const char * GBS_unwrap_regexpr(const char *regexpr_in_slashes, GB_CASE *case_flag, GB_ERROR *error)
Definition: arb_match.cxx:69
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
char * release()
Definition: arb_strbuf.h:129
char * GBS_regreplace(const char *str, const char *regReplExpr, GB_ERROR *error)
Definition: arb_match.cxx:175
void cat(const char *from)
Definition: arb_strbuf.h:199
char * ARB_strpartdup(const char *start, const char *end)
Definition: arb_string.h:51
const char * GBS_regmatch(const char *str, const char *regExpr, size_t *matchlen, GB_ERROR *error)
Definition: arb_match.cxx:138
static HelixNrInfo * start
static void error(const char *msg)
Definition: mkptypes.cxx:96
GB_CASE
Definition: arb_core.h:30
const char * GBS_regmatch_compiled(const char *str, GBS_regex *comreg, size_t *matchlen)
Definition: arb_match.cxx:120
void ncat(const char *from, size_t count)
Definition: arb_strbuf.h:189
char * give_buffer(size_t size)
Definition: arb_match.cxx:28
void GBS_free_regexpr(GBS_regex *toFree)
Definition: arb_match.cxx:62
#define NULp
Definition: cxxforward.h:116
regex_t compiled
Definition: arb_match.cxx:26
void put(char c)
Definition: arb_strbuf.h:174