ARB
MetaInfo.cxx
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : MetaInfo.cxx //
4 // Purpose : //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in November 2006 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================ //
11 
12 #include "MetaInfo.h"
13 #include <RegExpr.hxx>
14 
15 
16 using namespace std;
17 
18 void Reference::add(const string& field, const string& content) {
19  gi_assert(!field.empty());
20  gi_assert(!content.empty());
21 
22  stringMapIter existing = entries.find(field);
23  if (existing != entries.end()) {
24  throw GBS_global_string("Duplicated reference entry for '%s'", field.c_str());
25  }
26  entries[field] = content;
27 }
28 
29 const string *Reference::get(const string& field) const {
30  stringMapCIter existing = entries.find(field);
31  return (existing != entries.end()) ? &existing->second : NULp;
32 }
33 
34 void Reference::getKeys(stringSet& keys) const {
35  stringMapCIter e = entries.end();
36  for (stringMapCIter i = entries.begin(); i != e; ++i) {
37  keys.insert(i->first);
38  }
39 }
40 
41 // --------------------------------------------------------------------------------
42 
43 typedef vector<Reference> RefVector;
45 
47  refs.push_back(Reference());
48  latest = &refs.back();
49  ref_count++;
50 }
51 
52 
53 void References::getKeys(stringSet& keys) const {
54  keys.clear();
55  RefVectorCIter e = refs.end();
56  for (RefVectorCIter i = refs.begin(); i != e; ++i) {
57  i->getKeys(keys);
58  }
59 }
60 
61 string References::tagged_content(const string& refkey) const {
62  string content;
63 
64  if (ref_count == 1) { // only one reference -> don't tag
65  RefVectorCIter i = refs.begin();
66  const string *ref_content = i->get(refkey);
67 
68  gi_assert(ref_content);
69  content = *ref_content;
70  }
71  else {
72  int count = 1;
73  RefVectorCIter e = refs.end();
74 
75  for (RefVectorCIter i = refs.begin(); i != e; ++i, ++count) {
76  const string *ref_content = i->get(refkey);
77  if (ref_content) {
78  if (!content.empty()) content.append(1, ' ');
79  content.append(GBS_global_string("[REF%i] ", count));
80  content.append(*ref_content);
81  }
82  }
83  }
84  return content;
85 }
86 
87 #if defined(DEBUG)
88 void References::dump() const {
89  stringSet keys;
90  getKeys(keys);
91  stringSetIter e = keys.end();
92 
93  for (stringSetIter i = keys.begin(); i != e; ++i) {
94  string tagged = tagged_content(*i);
95  printf("%s='%s'\n", i->c_str(), tagged.c_str());
96  }
97 }
98 #endif // DEBUG
99 
// Classification of database ids (see dbid_definition).
// NOTE(review): enumerator order was reconstructed from the usage in this file -- confirm.
enum DBID_TYPE {
    DBID_STANDARD, // used for DOI, PUBMED and AGRICOLA
    DBID_ACCEPT,   // used for MEDLINE (non-standard, but common)
    DBID_ILLEGAL   // used by the end marker; causes the "Unknown DBID" error
};
105 struct DBID {
106  const char *id;
108  const char *arb_field;
109 };
110 
111 // see http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3_4_10_4
112 
113 static const DBID dbid_definition[] = { // accepted DBIDs (EMBL 'RX'-tag)
114  { "DOI", DBID_STANDARD, "doi_id" },
115  { "PUBMED", DBID_STANDARD, "pubmed_id" },
116  { "AGRICOLA", DBID_STANDARD, "agricola_id" },
117  { "MEDLINE", DBID_ACCEPT, "medline_id" }, // non-standard, but common
118 
119  { NULp, DBID_ILLEGAL, NULp }, // end marker
120 };
121 
122 void References::add_dbid(const string& content) {
123  // add embl 'RX' entry
124  //
125  // * 'content' has \n inserted at original line breaks and
126  // contains database references like 'MEDLINE; id.' or 'PUBMED; id.' etc.
127  // * Multiple database references may be concatenated (each starts on it's own line)
128  // * 'id' is possibly split up on several lines
129 
130  RegExpr reg_dbid("^([A-Z]+);\\s+|\n([A-Z]+);\\s+", false);
131  const RegMatch *dbid_start = reg_dbid.match(content);
132 
133  if (!dbid_start) {
134  if (reg_dbid.has_failed()) throw reg_dbid.get_error();
135  throw GBS_global_string("Expected database reference id (e.g. 'DOI; ' or 'PUBMED; ')");
136  }
137  else {
138  re_assert(reg_dbid.subexpr_count() == 2);
139  while (dbid_start) {
140  const RegMatch *sub = reg_dbid.subexpr_match(1);
141  if (!sub) sub = reg_dbid.subexpr_match(2);
142  re_assert(sub);
143 
144  string dbid = sub->extract(content);
145  size_t id_start = dbid_start->posBehindMatch();
146 
147  dbid_start = reg_dbid.match(content, id_start); // search for start of next db-id
148 
150  const char *arb_field = NULp;
151  for (int m = 0; ; m++) {
152  const char *name = dbid_definition[m].id;
153  if (!name) break;
154  if (dbid == name) {
155  type = dbid_definition[m].type;
156  arb_field = dbid_definition[m].arb_field;
157  break;
158  }
159  }
160  if (type == DBID_ILLEGAL) throw GBS_global_string("Unknown DBID '%s'", dbid.c_str());
161 
162  string id = content.substr(id_start, dbid_start ? dbid_start->pos()-id_start : string::npos);
163  if (id.empty()) throw GBS_global_string("Empty database reference for '%s'", dbid.c_str());
164  if (id[id.length()-1] != '.') throw GBS_global_string("Expected terminal '.' in '%s'", id.c_str());
165  id.erase(id.length()-1); // remove terminal '.'
166  add(arb_field, id);
167  }
168  }
169 }
170 
171 // --------------------------------------------------------------------------------
172 
173 void MetaInfo::add(const MetaTag *meta, const string& content, bool allow_multiple_entries) {
174  stringMapIter existing = entries.find(meta->field);
175  if (existing != entries.end()) { // existing entry
176  if (!allow_multiple_entries) {
177  throw GBS_global_string("Multiple occurrences of tag '%s'", meta->tag.c_str());
178  }
179  existing->second += '\n'+content; // append content
180  }
181  else { // non-existing content
182  entries[meta->field] = content;
183  }
184 }
185 
186 const string& MetaInfo::getAccessionNumber() const {
187  stringMapCIter found = entries.find("acc");
188  if (found == entries.end()) {
189  static string no_acc("<Missing accession number>");
190  return no_acc;
191  }
192  return found->second;
193 }
194 
195 #if defined(DEBUG)
196 void MetaInfo::dump() const {
197  stringMapCIter e = entries.end();
198 
199  printf("MetaInfo:\n");
200  for (stringMapCIter i = entries.begin(); i != e; ++i) {
201  printf("%s='%s'\n", i->first.c_str(), i->second.c_str());
202  }
203 }
204 #endif // DEBUG
const RegMatch * subexpr_match(size_t subnr) const
Definition: RegExpr.cxx:97
void getKeys(stringSet &keys) const
Definition: MetaInfo.cxx:53
GB_TYPES type
DBID_TYPE
Definition: MetaInfo.cxx:100
size_t posBehindMatch() const
Definition: RegExpr.hxx:52
DEFINE_ITERATORS(RefVector)
void add(const MetaTag *meta, const std::string &content, bool allow_multiple_entries)
Definition: MetaInfo.cxx:173
#define gi_assert(cond)
Definition: defs.h:26
void add(int v)
Definition: ClustalV.cxx:461
void add_dbid(const std::string &content)
Definition: MetaInfo.cxx:122
GB_ERROR get_error() const
Definition: RegExpr.hxx:79
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
size_t pos() const
Definition: RegExpr.hxx:49
vector< Reference > RefVector
Definition: MetaInfo.cxx:43
STL namespace.
std::string tag
Definition: MetaTag.h:33
const std::string & getAccessionNumber() const
Definition: MetaInfo.cxx:186
void add(const std::string &field, const std::string &content)
Definition: MetaInfo.cxx:18
const RegMatch * match(const std::string &versus, size_t offset=0) const
Definition: RegExpr.cxx:80
std::string field
Definition: MetaTag.h:34
static const DBID dbid_definition[]
Definition: MetaInfo.cxx:113
#define re_assert(cond)
Definition: refentries.h:38
const std::string * has_failed() const
Definition: RegExpr.hxx:78
const char * id
Definition: MetaInfo.cxx:106
const std::string * get(const std::string &field) const
Definition: MetaInfo.cxx:29
DBID_TYPE type
Definition: MetaInfo.cxx:107
void getKeys(stringSet &keys) const
Definition: MetaInfo.cxx:34
#define NULp
Definition: cxxforward.h:97
std::set< std::string > stringSet
Definition: types.h:28
std::string extract(const std::string &s) const
Definition: RegExpr.hxx:54
size_t subexpr_count() const
Definition: RegExpr.cxx:89
size_t length
void start()
Definition: MetaInfo.cxx:46
const char * arb_field
Definition: MetaInfo.cxx:108
std::string tagged_content(const std::string &refkey) const
Definition: MetaInfo.cxx:61