ARB
probe_match_parser.cxx
Go to the documentation of this file.
1 // ==================================================================== //
2 // //
3 // File : probe_match_parser.cxx //
4 // Purpose : parse the results of a probe match //
5 // //
6 // //
7 // Coded by Ralf Westram (coder@reallysoft.de) in June 2004 //
8 // Copyright Department of Microbiology (Technical University Munich) //
9 // //
10 // Visit our web site at: http://www.arb-home.de/ //
11 // //
12 // ==================================================================== //
13 
14 #include "probe_match_parser.hxx"
15 
16 #include <arbdbt.h>
17 #include <arb_defs.h>
18 
19 #include <cctype>
20 #include <map>
21 
22 #define pm_assert(cond) arb_assert(cond)
23 
24 using namespace std;
25 
26 // ----------------
27 // column
28 
29 struct column {
30  const char *title; // column title (pointer into ProbeMatch_impl::headline)
31  int start_column, end_column;
32 
33  column() : title(NULp), start_column(-1), end_column(-1) {}
34  column(const char *t, int sc, int ec) : title(t), start_column(sc), end_column(ec) {}
35 };
36 
37 // -------------------------
38 // ProbeMatch_impl
39 
40 typedef map<const char*, column, charpLess> ColumnMap;
41 
42 class ProbeMatch_impl : virtual Noncopyable {
43  char *headline;
44  ColumnMap columns;
45  int probe_region_offset; // left index of probe region
46 
47 
48 public:
49  ProbeMatch_impl(const char *headline_, char **errPtr) :
50  headline(NULp),
51  probe_region_offset(-1)
52  {
53  pm_assert(headline_);
54  headline = ARB_strdup(headline_);
55 
56  for (char *tok_start = strtok(headline, " "); tok_start; tok_start = strtok(NULp, " ")) {
57  char *tok_end = strchr(tok_start, 0)-1;
58 
59  int startPos = tok_start-headline;
60  int endPos = tok_end-headline;
61 
62  while (tok_end >= tok_start && tok_end[0] == '-') --tok_end;
63  while (tok_start <= tok_end && tok_start[0] == '-') ++tok_start;
64  pm_assert(tok_start <= tok_end); // otherwise column only contained '-'
65  tok_end[1] = 0;
66 
67  columns[tok_start] = column(tok_start, startPos-2, endPos-2); // -2 because headline is 2 shorter than other lines
68  }
69 
70  if (columns.empty()) *errPtr = ARB_strdup("No columns found");
71  }
72 
74  free(headline);
75  }
76 
77  column *findColumn(const char *columntitle) {
78  ColumnMap::iterator ci = columns.find(columntitle);
79  if (ci == columns.end()) return NULp;
80  return &(ci->second);
81  }
82 
83  void set_probe_region_offset(int offset) { probe_region_offset = offset; }
84  int get_probe_region_offset() const { return probe_region_offset; }
85 };
86 
87 // --------------------------
88 // ProbeMatchParser
89 
90 ProbeMatchParser::ProbeMatchParser(const char *probe_target, const char *headline) :
91  pimpl(NULp),
92  init_error(NULp)
93 {
94  if (!headline) {
95  init_error = ARB_strdup("No headline given");
96  }
97  else if (!probe_target) {
98  init_error = ARB_strdup("No probe target given.");
99  }
100  else {
101  pimpl = new ProbeMatch_impl(headline, &init_error);
102  if (!init_error) {
103  // modify target, so that it matches the target string in headline
104  char *probe_target_copy = GBS_global_string_copy("'%s'", probe_target); // add single quotes
105  for (int i = 0; probe_target_copy[i]; ++i) {
106  probe_target_copy[i] = toupper(probe_target_copy[i]);
107  if (probe_target_copy[i] == 'T') { // replace 'T' by 'U'
108  probe_target_copy[i] = 'U';
109  }
110  }
111 
112  // find that column and
113  column *target_found = pimpl->findColumn(probe_target_copy);
114  if (!target_found) {
115  char *probe_rev_compl = ARB_strdup(probe_target_copy);
116  GBT_reverseComplementNucSequence(probe_rev_compl, strlen(probe_rev_compl), 'U');
117  target_found = pimpl->findColumn(probe_rev_compl);
118  free(probe_rev_compl);
119  }
120 
121  if (target_found) {
122  int probe_region_offset = target_found->start_column - 9;
123  pimpl->set_probe_region_offset(probe_region_offset);
124  }
125  else {
126  init_error = GBS_global_string_copy("Probe match parser failed (Could not find target '%s' in headline)", probe_target_copy);
127  }
128  free(probe_target_copy);
129  }
130  }
131 }
132 
134  free(init_error);
135  delete pimpl;
136 }
137 
138 bool ProbeMatchParser::getColumnRange(const char *columnName, int *startCol, int *endCol) const {
139  pm_assert(!init_error);
140  column *col = pimpl->findColumn(columnName);
141  if (!col) return false;
142 
143  *startCol = col->start_column;
144  *endCol = col->end_column;
145  return true;
146 }
147 
149  pm_assert(!init_error);
150  return pimpl->findColumn("organism") && pimpl->findColumn("genename");
151 }
152 
154  pm_assert(!init_error);
155  return pimpl->get_probe_region_offset();
156 }
157 
158 // --------------------------
159 // ParsedProbeMatch
160 
161 ParsedProbeMatch::ParsedProbeMatch(const char *match_, const ProbeMatchParser& parser_) :
162  parser(parser_),
163  match(NULp),
164  error(NULp)
165 {
166  if (match_) match = ARB_strdup(match_);
167  else error = "No match given";
168 }
169 
171  free(match);
172 }
173 
174 inline char *strpartdup(const char *str, int c1, int c2) {
175  int len = c2-c1+1;
176 
177  pm_assert(str);
178  pm_assert(c1 <= c2);
179  pm_assert((int)strlen(str) > c2);
180 
181  return ARB_strndup(str+c1, len);
182 }
183 
185  pm_assert(!error);
186  int c1, c2;
187  if (parser.getColumnRange("pos", &c1, &c2)) {
188  char *content = strpartdup(match, c1, c2);
189  int pos = bio2info(atoi(content));
190  free(content);
191  return pos;
192  }
193  error = "no such column: 'pos'";
194  return -1;
195 }
196 
198  pm_assert(!error);
199  int pro = parser.pimpl->get_probe_region_offset();
200  int matchlen = strlen(match);
201 
202  if (pro<matchlen) {
203  return match+pro;
204  }
205 
206  error = GBS_global_string("can't parse match info '%s'", match);
207  return NULp;
208 }
209 
210 char *ParsedProbeMatch::get_column_content(const char *columnName, bool chop_spaces) const {
211  pm_assert(!error);
212  int sc, ec;
213  if (parser.getColumnRange(columnName, &sc, &ec)) {
214  if (chop_spaces) {
215  while (sc<ec && match[sc] == ' ') ++sc;
216  while (sc<ec && match[ec] == ' ') --ec;
217  }
218  return strpartdup(match, sc, ec);
219  }
220  return NULp;
221 }
char * get_column_content(const char *columnName, bool chop_spaces) const
ParsedProbeMatch(const char *match_, const ProbeMatchParser &parser_)
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
ProbeMatchParser(const char *probe_target, const char *headline)
STL namespace.
ProbeMatch_impl(const char *headline_, char **errPtr)
const char * get_probe_region() const
int get_probe_region_offset() const
NOT4PERL void GBT_reverseComplementNucSequence(char *seq, long length, char T_or_U)
Definition: adRevCompl.cxx:102
int get_probe_region_offset() const
map< const char *, column, charpLess > ColumnMap
static void error(const char *msg)
Definition: mkptypes.cxx:96
void set_probe_region_offset(int offset)
column(const char *t, int sc, int ec)
ASSERTING_CONSTEXPR_INLINE int bio2info(int biopos)
Definition: arb_defs.h:26
char * ARB_strndup(const char *start, int len)
Definition: arb_string.h:83
bool is_gene_result() const
bool getColumnRange(const char *columnName, int *startCol, int *endCol) const
#define pm_assert(cond)
#define NULp
Definition: cxxforward.h:97
const char * title
#define offset(field)
Definition: GLwDrawA.c:73
char * strpartdup(const char *str, int c1, int c2)
static int column
Definition: arb_a2ps.c:295
column * findColumn(const char *columntitle)
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195