ARB
NT_validNameParser.cxx
Go to the documentation of this file.
1 /*
2  * Definition of all objects belonging to this version of
3  * the valid names text file
4  *
5  * 29. November 2002
6  *
7  * coded by Lothar Richter
8  *
9  * Copyright (C) 2002 Department of Microbiology (Technical University Munich)
10  */
11 
12 #if defined(DEVEL_LOTHAR)
13 #define DUMP
14 #endif // DEVEL_LOTHAR
15 
16 #include "NT_validNameParser.h"
17 #include "NT_local.h"
18 
19 #include <cstdlib>
20 #include <cstdlib>
21 #include <iostream>
22 #include <fstream>
23 
24 using namespace std;
25 
26 namespace validNames {
27 
28 
29  TokLPtr tokenize(const std::string& description, TokLPtr tokenLP) {
30  size_t tokenEnd = 0;
31  size_t tokenBegin = 0;
32 
33  while (tokenEnd != description.size()) { // CC : warning: comparison between signed and unsigned (tokenEnd sollte nicht 'int' sondern 'unsigned' sein)
34  tokenEnd = description.find_first_of(' ', tokenBegin);
35  if (tokenEnd == string::npos) tokenEnd = description.size();
36  int tokLength = tokenEnd - tokenBegin;
37  if (tokLength != 0) {
38  tokenLP->push_back(description.substr(tokenBegin, tokenEnd - tokenBegin));
39  }
40  tokenBegin = tokenEnd + 1;
41 
42  }
43  return tokenLP;
44  }
45 
46 
47 
48 
49 
50  Desco determineType(const string& descriptionString)
51  { // begin determineType
52 
53  DESCT actType = NOTYPE;
54  TokLPtr tokenLP = new TokL;
55  tokenLP = tokenize(descriptionString, tokenLP);
56  // remove all tokens in parentheses
57  {
58  TokL::iterator it = tokenLP->begin();
59  while (it != tokenLP->end()) {
60  if (((*it).at(0) == '(') && *it != string("(corrig.)")) it = tokenLP->erase(it);
61  else ++it;
62  }
63  }
64 
65  // check first word for upper case letters
66  string descNames[6]; // first the valid genus, species, subsp. then the other names
67  // stores occurrence of subsp. which is needed to retrieve the right tokens later on and status flags
68  int sspPos[2] = { 0, 0 }; // token subsp. occurs maximum twice
69  int ssp = 0;
70  bool isValid = true;
71  bool isRenamed = false;
72  bool isHetero = false;
73  bool isHomo = false;
74  bool isGenus = false;
75  // bool isSee = false;
76  bool isCorr = false;
77 
78 
79 
80  for (TokL::iterator it = tokenLP->begin(); it != tokenLP->end(); ++it, ++ssp) {
81  if (isUpperCase(*it)) {
82  isGenus = true;
83 #if defined(DUMP)
84  std::cout << "genus detected" << std::endl;
85 #endif // DUMP
86  }
87 
88 
89  else { // begin operators
90  if (*it == string("->")) {
91  nt_assert(!isHetero);
92  nt_assert(!isHomo);
93  nt_assert(isValid); // only one operator per line allowed
94  isRenamed = true;
95  isValid = false;
96 #if defined(DUMP)
97  std::cout << "renaming detected" << std::endl;
98 #endif // DUMP
99  }
100  else {
101  if (*it == string("=>")) {
102  nt_assert(!isRenamed);
103  nt_assert(!isHomo);
104  nt_assert(isValid);
105  isHetero = true;
106  isValid = false;
107 #if defined(DUMP)
108  std::cout << "heteronym detected" << std::endl;
109 #endif // DUMP
110  }
111  else {
112  if (*it == string("=")) {
113  nt_assert(!isRenamed);
114  nt_assert(!isHetero);
115  nt_assert(isValid);
116  isHomo = true;
117  isValid = false;
118 #if defined(DUMP)
119  std::cout << "homonym detected" << std::endl;
120 #endif // DUMP
121  }
122  else {
123  if (*it == string("(corrig.)")) {
124  isCorr = true;
125 #if defined(DUMP)
126  std::cout << "correction" << std::endl;
127 #endif // DUMP
128  }
129  else {
130  if (*it == string("see:")) {
131  // isSee = true;
132  isValid = false;
133 #if defined(DUMP)
134  std::cout << "reference" << std::endl;
135 #endif // DUMP
136  }
137  else {
138  if (*it == string("subsp.")) {
139 #if defined(DUMP)
140  std::cout << "subspecies detected at position: >>>" << ssp << "<<<" << std::endl;
141 #endif // DUMP
142  ssp == 2 ? sspPos[0] = ssp : sspPos[1] = ssp;
143  // max. one subsp. on each operator side
144 #if defined(DUMP)
145  std::cout << "position of subsp.: " << sspPos[0] << "\tand: " << sspPos[1] << std::endl;
146 #endif // DUMP
147  }
148  }
149  }
150  }
151  }
152  }
153  }
154  }
155 
156 
157 
158  if (isGenus) {
159 #if defined(DUMP)
160  std::cout << " GENUS description found " << std::endl;
161 #endif // DUMP
162  if (isValid) {
163  descNames[0] = (*tokenLP)[0];
164  actType = VALGEN;
165 #if defined(DUMP)
166  std::cout << "VALIDGEN type set to: " << actType << std::endl;
167 #endif// DUMP
168  }
169  else {
170  if (isHetero) {
171  descNames[0] = (*tokenLP)[2];
172  descNames[3] = (*tokenLP)[0];
173  actType = HETGEN;
174 #if defined(DUMP)
175  std::cout << "HETERONYMGEN type set to: " << actType << std::endl;
176 #endif // DUMP
177  }
178  else {
179  if (isHomo) {
180  descNames[0] = (*tokenLP)[2];
181  descNames[3] = (*tokenLP)[0];
182  actType = HOMGEN;
183 #if defined(DUMP)
184  std::cout << "HOMONYMGEN type set to: " << actType << std::endl;
185 #endif // DUMP
186 
187  }
188  else {
189 
190  if (isRenamed) {
191  descNames[0] = (*tokenLP)[2];
192  descNames[3] = (*tokenLP)[0];
193  actType = RENGEN;
194 #if defined(DUMP)
195  std::cout << "RENAMEDGEN type set to: " << actType << std::endl;
196 #endif // DUMP
197  }
198  else {
199 #if defined(DUMP)
200  std::cout << "no meaningful combination of conditions reached" << std::endl
201  << "for line: " << descriptionString << std::endl;
202  std::cout << "description type is set to NOTYPE: " << NOTYPE << std::endl;
203 #endif // DUMP
204  isValid = false;
205 #if defined(DUMP)
206  std::cout << "isValid set to false " << std::endl;
207 #endif // DUMP
208  actType = NOTYPE;
209  }
210  }
211  }
212  }
213  }
214  else {
215 
216  // just fancy experimental , maybe not 100% correct but looks good
217  if (!(((sspPos[0] == 0) || (sspPos[0] == 2)) && (((sspPos[1] > 4)&&(sspPos[1]< 9))||(sspPos[1]==0)))) {
218 #if defined(DUMP)
219  std::cout << "subsp. at strange position found in line:" << std::endl << descriptionString << endl;
220  std::cout << "description type is set to NOTYPE: " << NOTYPE << std::endl;
221 #endif // DUMP
222  isValid = false;
223 #if defined(DUMP)
224  std::cout << "isValid set to false " << std::endl;
225 #endif // DUMP
226  actType = NOTYPE;
227  }
228 
229  if (isValid) {
230  descNames[0] = (*tokenLP)[0];
231  descNames[1] = (*tokenLP)[1];
232  if (sspPos[0] != 0) { descNames[2] = (*tokenLP)[sspPos[0]+1]; } // only if subsp. exists
233  actType = VALSPEC;
234  }
235  else { // begin else isHetero
236  if (isHetero) {
237  descNames[0] = (*tokenLP)[3 + sspPos[0]];
238  descNames[1] = (*tokenLP)[4 + sspPos[0]];
239  if (sspPos[1]!=0) { descNames[2]=(*tokenLP)[6 + sspPos[0]]; } // only if subsp. exists
240 
241  descNames[3] = (*tokenLP)[0];
242  descNames[4] = (*tokenLP)[1];
243  if (sspPos[0]!=0) { descNames[5]=(*tokenLP)[sspPos[0]+1]; } // only if subsp. exists
244 
245  actType = HETSPEC;
246  }
247  else {
248  if (isHomo) {
249  descNames[0] = (*tokenLP)[3 + sspPos[0]];
250  descNames[1] = (*tokenLP)[4 + sspPos[0]];
251  if (sspPos[1]!=0) { descNames[2]=(*tokenLP)[6 + sspPos[0]]; } // only if subsp. exists
252 
253  descNames[3] = (*tokenLP)[0];
254  descNames[4] = (*tokenLP)[1];
255  if (sspPos[0]!=0) { descNames[5]=(*tokenLP)[sspPos[0]+1]; } // only if subsp. exists
256 
257  actType = HOMSPEC;
258 
259  }
260  else { // else branch isHomo
261  if (isRenamed) {
262  descNames[0] = (*tokenLP)[3 + sspPos[0]];
263  descNames[1] = (*tokenLP)[4 + sspPos[0]];
264  if (sspPos[1]!=0) { descNames[2]=(*tokenLP)[6 + sspPos[0]]; } // only if subsp. exists
265 
266  descNames[3] = (*tokenLP)[0];
267  descNames[4] = (*tokenLP)[1];
268  if (sspPos[0]!=0) { descNames[5]=(*tokenLP)[sspPos[0]+1]; } // only if subsp. exists
269 
270  actType = RENSPEC;
271 
272  }
273  else { // species remaining cases
274 #if defined(DUMP)
275  std::cout << "not a valid description line detected" << std::endl;
276  std::cout << "isValid: " << isValid << std::endl;
277  std::cout << "isRenamed: " << isRenamed << std::endl;
278  std::cout << "isHetero: " << isHetero << std::endl;
279  std::cout << "isHomo: " << isHomo << std::endl;
280  std::cout << "isGenus: " << isGenus << std::endl;
281  std::cout << "isSee: " << isSee << std::endl;
282  std::cout << "isCorr: " << isCorr << std::endl;
283  std::cout << "sspPos: " << sspPos[0] << " and " << sspPos[1] << std::endl;
284  std::cout << descriptionString << std::endl;
285 #endif // DUMP
286  actType = NOTYPE;
287  }
288 
289  }
290  }
291  }
292  }
293 
294 
295 #if defined(DUMP)
296  std::cout << descriptionString << std::endl;
297  std::cout << "classified as " << actType << std::endl;
298 #endif // DUMP
299 
300  Desco actDesc(actType, isCorr, descNames[0], descNames[1], descNames[2], descNames[3], descNames[4], descNames[5]);
301  delete tokenLP;
302  return actDesc;
303  }
304 
305 
306  string Desco::getFirstName() {
307  string tmp = firstgen;
308  if (!firstspec.empty()) {
309  tmp = tmp + " " + firstspec;
310  if (!firstsub.empty()) {
311  tmp = tmp + " " + "subsp." + " " + firstsub;
312  }
313  }
314 
315 
316  return tmp;
317  }
318 
319  string Desco::getSecondName() {
320  string tmp = secondgen;
321  if (!secondspec.empty()) {
322  tmp = tmp + " " + firstspec;
323  if (!secondsub.empty()) {
324  tmp = tmp + " " + "subsp." + " " + secondsub;
325  }
326  }
327  return tmp;
328  }
329 
330 
// Tells whether 'input' consists solely of the characters 'A' to 'Z'.
// Any other character (lower case, digit, punctuation, blank) yields false.
// An empty string contains no offending character and therefore yields true.
bool isUpperCase(const string& input) {
    string::const_iterator ch = input.begin();
    while (ch != input.end()) {
        const bool isCapital = (*ch >= 'A') && (*ch <= 'Z');
        if (!isCapital) return false;
        ++ch;
    }
    return true;
}
337 }
return string(buffer, length)
STL namespace.
Desco determineType(const string &descriptionString)
bool isUpperCase(const string &input)
#define nt_assert(cond)
Definition: NT_local.h:27
TokLPtr tokenize(const std::string &description, TokLPtr tokenLP)
std::vector< std::string > TokL