ARB
DBwriter.cxx
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : DBwriter.cxx //
4 // Purpose : //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in November 2006 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================ //
11 
12 #include "DBwriter.h"
13 
14 #define AW_RENAME_SKIP_GUI
15 
16 #include <algorithm>
17 #include <AW_rename.hxx>
18 #include <Translate.hxx>
19 #include <aw_question.hxx>
20 #include <GEN.hxx>
21 #include <adGene.h>
22 #include <arb_stdstr.h>
23 
24 #define ARB_GENE_REF "ARB_is_gene"
25 
26 using namespace std;
27 
28 typedef SmartCustomPtr(GEN_position, GEN_free_position) GEN_positionPtr;
29 
30 // --------------------------------------------------------------------------------
31 
32 void DBerror::init(const string& msg, GB_ERROR gberror) {
33  gi_assert(gberror);
34  if (gberror) err = msg+" (Reason: "+gberror+")";
35  else err = msg; // ndebug!
36 }
37 
39 DBerror::DBerror(const char *msg) { string errmsg(msg); init(errmsg, GB_await_error()); }
40 DBerror::DBerror(const string& msg) { init(msg, GB_await_error()); }
41 DBerror::DBerror(const char *msg, GB_ERROR gberror) { string errmsg(msg); init(errmsg, gberror); }
42 DBerror::DBerror(const string& msg, GB_ERROR gberror) { init(msg, gberror); }
43 
44 // --------------------------------------------------------------------------------
45 // DB access functions, throwing DBerror on failure
46 
47 static GBDATA *DB_create_container(GBDATA *parent, const char *name, bool mark) {
48  // create container (optionally set mark flag)
49  GBDATA *gb_container = GB_create_container(parent, name);
50  if (!gb_container) throw DBerror(GBS_global_string("Failed to create container '%s'", name));
51 
52  if (mark) GB_write_flag(gb_container, 1);
53 
54  return gb_container;
55 }
56 
57 static GBDATA *DB_create_string_field(GBDATA *parent, const char *field, const char *content) {
58  // create field with content
59 
60  gi_assert(content[0]);
61  // do NOT WRITE empty string-fields into ARB DB,
62  // cause ARB DB does not differ between empty content and non-existing fields
63  // (i.e. when writing an empty string, ARB removes the field)
64 
65  GBDATA *gb_field = GB_create(parent, field, GB_STRING);
66  if (!gb_field) throw DBerror(GBS_global_string("Failed to create field '%s'", field));
67 
68  GB_ERROR err = GB_write_string(gb_field, content);
69  if (err) throw DBerror(GBS_global_string("Failed to write to field '%s'", field), err);
70 
71  return gb_field;
72 }
73 
74 static GBDATA *DB_create_byte_field(GBDATA *parent, const char *field, unsigned char content) {
75  // create field with content
76  GBDATA *gb_field = GB_create(parent, field, GB_BYTE);
77  if (!gb_field) throw DBerror(GBS_global_string("Failed to create field '%s'", field));
78 
79  GB_ERROR err = GB_write_byte(gb_field, content);
80  if (err) throw DBerror(GBS_global_string("Failed to write to field '%s'", field), err);
81 
82  return gb_field;
83 }
84 
85 // --------------------------------------------------------------------------------
86 
87 void DBwriter::createOrganism(const string& flatfile, const char *importerTag) {
88  gi_assert(!gb_organism && !gb_gene_data);
89 
90  // create the organism
91  {
92  UniqueNameDetector& UND_species = *session.und_species;
93 
94  char *organism_name = AWTC_makeUniqueShortName("genome", UND_species);
95  if (!organism_name) throw DBerror();
96 
97  gb_organism = DB_create_container(session.gb_species_data, "species", true);
98  DB_create_string_field(gb_organism, "name", organism_name);
99 
100  UND_species.add_name(organism_name);
101  free(organism_name);
102  }
103 
104  // store info about source
105  DB_create_string_field(gb_organism, "ARB_imported_from", flatfile.c_str());
106  DB_create_string_field(gb_organism, "ARB_imported_format", importerTag);
107 }
108 
109 typedef map<string, string, NoCaseCmp> TranslateMap;
110 struct Translator { TranslateMap trans; };
111 
112 Translator *DBwriter::unreserve = NULp;
113 
114 const string& DBwriter::getUnreservedQualifier(const string& qualifier) {
115  // return a non-reserved qualifier
116  // (some are reserved - e.g. name, pos_start, ... and all starting with 'ARB_')
117  // if a qualifier is reserved, 'ORG_' is prepended.
118  //
119  // (Note: When we'll export data, 'ORG_' shall be removed from qualifiers!)
120 
121  static string prefix = "ORG_";
122 
123  if (!unreserve) {
124  unreserve = new Translator;
125  const char *reserved[] = {
126  "name", "type",
127  "pos_start", "pos_stop", "pos_complement", "pos_certain", "pos_joined",
128  NULp
129  };
130 
131  for (int i = 0; reserved[i]; ++i) {
132  unreserve->trans[reserved[i]] = prefix+reserved[i];
133  }
134  }
135 
136  TranslateMap::const_iterator found = unreserve->trans.find(qualifier);
137  if (found != unreserve->trans.end()) { // qualifier is reserved
138  return found->second;
139  }
140  if (NoCaseCmp::has_prefix(qualifier, prefix) || // qualifier starts with 'ORG_'
141  NoCaseCmp::has_prefix(qualifier, "ARB_")) // or 'ARB_'
142  {
143  unreserve->trans[qualifier] = prefix+qualifier; // add as 'ORG_ORG_' or 'ORG_ARB_' to TranslateMap
144  return unreserve->trans[qualifier];
145  }
146  return qualifier;
147 }
148 
149 void DBwriter::writeFeature(const Feature& feature, long seqLength) {
150  gi_assert(gb_organism);
151  if (!gb_gene_data) {
152  gb_gene_data = DB_create_container(gb_organism, "gene_data", false);
153  generatedGenes.clear();
154  }
155 
156  // create new gene
157  GBDATA *gb_gene;
158  {
159  string gene_name = feature.createGeneName();
160  NameCounter::iterator existing = generatedGenes.find(gene_name);
161  if (existing == generatedGenes.end()) { // first occurrence of that gene name
162  generatedGenes[gene_name] = 1;
163  }
164  else {
165  existing->second++; // increment occurrences
166  }
167 
168  gb_gene = DB_create_container(gb_gene_data, "gene", false);
169  DB_create_string_field(gb_gene, "name", gene_name.c_str());
170 
171  string type = feature.getType();
172  DB_create_string_field(gb_gene, "type", type.c_str());
173 
174  if (type == "source") {
175  DB_create_byte_field(gb_gene, ARB_HIDDEN, 1);
176  }
177  }
178 
179  // store location
180  {
181  GEN_positionPtr pos = feature.getLocation().create_GEN_position();
182  GB_ERROR err = GEN_write_position(gb_gene, &*pos, seqLength);
183  if (err) throw DBerror("Failed to write location", err);
184  }
185 
186  // store qualifiers
187  {
188  const stringMap& qualifiers = feature.getQualifiers();
189  stringMapCIter e = qualifiers.end();
190 
191  for (stringMapCIter i = qualifiers.begin(); i != e; ++i) {
192  const string& unreserved = getUnreservedQualifier(i->first);
193  DB_create_string_field(gb_gene, unreserved.c_str(), i->second.c_str());
194  }
195  }
196 }
197 
199  gi_assert(gb_organism);
200  GBDATA *gb_data = GBT_add_data(gb_organism, ali_name, "data", GB_STRING);
201  if (!gb_data) throw DBerror("Failed to create alignment");
202 
203  GB_ERROR err = GB_write_string(gb_data, seqData.getSequence());
204  if (err) throw DBerror("Failed to write alignment", err);
205 }
206 
208  NameCounter renameCounter; // for those genes which get renumbered, count upwards here
209  {
210  NameCounter::iterator gg_end = generatedGenes.end();
211  for (NameCounter::iterator gg = generatedGenes.begin(); gg != gg_end; ++gg) {
212  if (gg->second > 1) renameCounter[gg->first] = 0;
213  }
214  }
215  NameCounter::iterator rc_end = renameCounter.end();
216 
217  for (GBDATA *gb_gene = GB_entry(gb_gene_data, "gene");
218  gb_gene;
219  gb_gene = GB_nextEntry(gb_gene))
220  {
221  GBDATA *gb_name = GB_entry(gb_gene, "name");
222  string gene_name = GB_read_char_pntr(gb_name);
223 
224  NameCounter::iterator rc = renameCounter.find(gene_name);
225  if (rc != rc_end) { // rename current_gene
226  int maxOccurrences = generatedGenes[gene_name];
227  rc->second++; // increment occurrence counter
228  gi_assert(rc->second <= maxOccurrences);
229 
230  int digits = strlen(GBS_global_string("%i", maxOccurrences));
231  gene_name += GBS_global_string("_%0*i", digits, rc->second);
232  GB_ERROR err = GB_write_string(gb_name, gene_name.c_str());
233  if (err) throw DBerror("Failed to write to field 'name' (during gene-renumbering)", err);
234  }
235  }
236 }
237 
238 static void importerWarning(AW_CL cl_importer, const char *message) {
239  Importer *importer = reinterpret_cast<Importer*>(cl_importer);
240  importer->warning(message);
241 }
242 
243 void DBwriter::testAndRemoveTranslations(Importer& importer) {
244  GEN_testAndRemoveTranslations(gb_gene_data, importerWarning, reinterpret_cast<AW_CL>(&importer), session.ok_to_ignore_wrong_start_codon);
245 }
246 
247 // ----------------------------------------------
248 // hide duplicated genes (from genebank)
249 
250 inline bool operator<(const GEN_positionPtr& A, const GEN_positionPtr& B) {
251  const GEN_position& a = *A;
252  const GEN_position& b = *B;
253 
254  int cmp = int(a.start_pos[0]) - int(b.start_pos[0]);
255  if (!cmp) {
256  cmp = int(a.stop_pos[a.parts-1]) - int(b.stop_pos[b.parts-1]);
257  if (!cmp) {
258  cmp = int(a.parts) - int(b.parts); // less parts is <
259  if (!cmp) {
260  for (int p = 0; p<a.parts; ++p) { // compare all parts
261  cmp = int(b.complement[p]) - int(a.complement[p]); if (cmp) break; // non-complement is <
262  cmp = int(a.start_pos[p]) - int(b.start_pos[p]); if (cmp) break;
263  cmp = int(a.stop_pos[p]) - int(b.stop_pos[p]); if (cmp) break;
264  }
265  }
266  }
267  }
268 
269  return cmp<0;
270 }
271 
272 class PosGene {
273  RefPtr<GBDATA> gb_Gene;
274  mutable GEN_positionPtr pos;
275 
276 public:
277  PosGene(GBDATA *gb_gene) : gb_Gene(gb_gene) {
278  GEN_position *pp = GEN_read_position(gb_gene);
279  if (!pp) {
280  throw GBS_global_string("Can't read gene position (Reason: %s)", GB_await_error());
281  }
282  pos = pp;
283  }
284 
285  const GEN_positionPtr& getPosition() const { return pos; }
286 
287  const char *getName() const {
288  GBDATA *gb_name = GB_entry(gb_Gene, "name");
289  gi_assert(gb_name);
290  return GB_read_char_pntr(gb_name);
291  }
292  const char *getType() const {
293  GBDATA *gb_type = GB_entry(gb_Gene, "type");
294  gi_assert(gb_type);
295  return GB_read_char_pntr(gb_type);
296  }
297  bool hasType(const char *type) const { return strcmp(getType(), type) == 0; }
298 
299  void hide() { DB_create_byte_field(gb_Gene, ARB_HIDDEN, 1); }
300  void addRefToGene(const char *name_of_gene) { DB_create_string_field(gb_Gene, ARB_GENE_REF, name_of_gene); }
301 
302 #if defined(DEBUG)
303  const char *description() const {
304  return GBS_global_string("%zi-%zi (%i parts)", pos->start_pos[0], pos->stop_pos[pos->parts-1], pos->parts);
305  }
306  void dump() {
307  GB_dump(gb_Gene);
308  }
309 #endif // DEBUG
310 
311 };
312 
313 inline bool operator<(const PosGene& a, const PosGene& b) {
314  return a.getPosition() < b.getPosition();
315 }
316 inline bool operator == (const PosGene& a, const PosGene& b) {
317  return !(a<b || b<a);
318 }
319 
320 class hasType {
321  const char *type;
322 public:
323  hasType(const char *t) : type(t) {}
324  bool operator()(const PosGene& pg) { return pg.hasType(type); }
325 };
326 
327 void DBwriter::hideUnwantedGenes() {
328  typedef vector<PosGene> Genes;
329  typedef Genes::iterator GeneIter;
330 
331  Genes gps;
332 
333  // read all gene positions
334  for (GBDATA *gb_gene = GB_entry(gb_gene_data, "gene"); gb_gene; gb_gene = GB_nextEntry(gb_gene)) {
335  gps.push_back(PosGene(gb_gene));
336  }
337 
338  sort(gps.begin(), gps.end()); // sort positions
339 
340  // find duplicate geness (with identical location)
341  GeneIter end = gps.end();
342  GeneIter p = gps.begin();
343  GeneIter firstEqual = p;
344  GeneIter lastEqual = p;
345 
346  while (firstEqual != end) {
347  ++p;
348  if (p != end && *p == *firstEqual) {
349  lastEqual = p;
350  }
351  else { // found different element (or last)
352  int count = lastEqual-firstEqual+1;
353 
354  if (count>1) { // we have 2 or more duplicate genes
355  GeneIter equalEnd = lastEqual+1;
356  GeneIter gene = find_if(firstEqual, equalEnd, hasType("gene")); // locate 'type' == 'gene'
357 
358  if (gene != equalEnd) { // found type gene
359  bool hideGene = false;
360  GeneIter e = firstEqual;
361  const char *gene_name = gene->getName();
362 
363  while (e != equalEnd) {
364  if (e != gene) { // for all genes that have 'type' != 'gene'
365  e->addRefToGene(gene_name); // add a reference to the gene
366  hideGene = true; // and hide the gene
367  }
368  ++e;
369  }
370 
371  if (hideGene) gene->hide();
372  }
373  }
374  firstEqual = lastEqual = p;
375  }
376  }
377 }
378 
379 
380 void DBwriter::finalizeOrganism(const MetaInfo& meta, const References& refs, Importer& importer) {
381  // write metadata
382  {
383  const stringMap& entries = meta.getEntries();
384  stringMapCIter e = entries.end();
385 
386  for (stringMapCIter i = entries.begin(); i != e; ++i) {
387  const string& content = i->second;
388  if (!content.empty()) { // skip empty entries
389  DB_create_string_field(gb_organism, i->first.c_str(), content.c_str());
390  }
391  }
392  }
393 
394  // write references
395  {
396  stringSet refKeys;
397  refs.getKeys(refKeys);
398 
399  stringSetIter e = refKeys.end();
400  for (stringSetIter i = refKeys.begin(); i != e; ++i) {
401  DB_create_string_field(gb_organism, i->c_str(), refs.tagged_content(*i).c_str());
402  }
403  }
404 
405  // finalize genes data
406  if (gb_gene_data) {
407  renumberDuplicateGenes(); // renumber genes with equal names
408  testAndRemoveTranslations(importer); // test translations and remove reproducible translations
409  hideUnwantedGenes();
410  }
411  else GB_warning("No genes have been written (missing feature table?)");
412 
413  // cleanup
414  generatedGenes.clear();
415  gb_gene_data = NULp;
416  gb_organism = NULp;
417 }
418 
420  if (unreserve) {
421  delete unreserve;
422  unreserve = NULp;
423  }
424 }
425 
426 
427 
void getKeys(stringSet &keys) const
Definition: MetaInfo.cxx:53
GB_TYPES type
bool operator()(const PosGene &pg)
Definition: DBwriter.cxx:324
void GB_warning(const char *message)
Definition: arb_msg.cxx:530
unsigned char * complement
Definition: adGene.h:41
#define ARB_GENE_REF
Definition: DBwriter.cxx:24
const char * getName() const
Definition: DBwriter.cxx:287
void writeSequence(const SequenceBuffer &seqData)
Definition: DBwriter.cxx:198
Definition: arbdb.h:65
static void deleteStaticData()
Definition: DBwriter.cxx:419
void writeFeature(const Feature &feature, long seqLength)
Definition: DBwriter.cxx:149
#define gi_assert(cond)
Definition: defs.h:26
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1387
void GEN_free_position(GEN_position *pos)
Definition: adGene.cxx:195
#define ARB_HIDDEN
Definition: adGene.h:22
Definition: trnsprob.h:20
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
std::map< std::string, int > NameCounter
Definition: DBwriter.h:50
int parts
Definition: adGene.h:37
static ArbImporter * importer
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
TranslateMap trans
Definition: DBwriter.cxx:110
static GBDATA * DB_create_byte_field(GBDATA *parent, const char *field, unsigned char content)
Definition: DBwriter.cxx:74
STL namespace.
PosGene(GBDATA *gb_gene)
Definition: DBwriter.cxx:277
const GEN_positionPtr & getPosition() const
Definition: DBwriter.cxx:285
void finalizeOrganism(const MetaInfo &meta, const References &refs, Importer &importer)
Definition: DBwriter.cxx:380
char * AWTC_makeUniqueShortName(const char *prefix, UniqueNameDetector &existingNames)
Definition: AW_rename.cxx:648
void warning(const char *msg)
Definition: Importer.cxx:128
NOT4PERL GBDATA * GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type) __ATTR__DEPRECATED_TODO("better use GBT_create_sequence_data()")
Definition: adali.cxx:597
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
static void importerWarning(AW_CL cl_importer, const char *message)
Definition: DBwriter.cxx:238
GBDATA * GB_create_container(GBDATA *father, const char *key)
Definition: arbdb.cxx:1829
const stringMap & getQualifiers() const
Definition: Feature.h:32
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1781
DBerror()
Definition: DBwriter.cxx:38
CONSTEXPR_INLINE int digits(int parts)
void message(char *errortext)
size_t * stop_pos
Definition: adGene.h:40
void createOrganism(const string &flatfile, const char *importerTag)
Definition: DBwriter.cxx:87
#define cmp(h1, h2)
Definition: admap.cxx:50
static bool has_prefix(const std::string &s1, const std::string &s2)
Definition: arb_stdstr.h:46
GB_ERROR GEN_write_position(GBDATA *gb_gene, const GEN_position *pos, long seqLength)
Definition: adGene.cxx:325
GEN_position * create_GEN_position() const
Definition: Location.cxx:309
map< string, string, NoCaseCmp > TranslateMap
Definition: DBwriter.cxx:109
void hide()
Definition: DBwriter.cxx:299
std::string createGeneName() const
Definition: Feature.cxx:97
std::map< std::string, std::string > stringMap
Definition: types.h:29
const std::string & getType() const
Definition: Feature.h:30
hasType(const char *t)
Definition: DBwriter.cxx:323
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2773
long AW_CL
Definition: cb.h:21
void renumberDuplicateGenes()
Definition: DBwriter.cxx:207
void addRefToGene(const char *name_of_gene)
Definition: DBwriter.cxx:300
bool operator==(const PosGene &a, const PosGene &b)
Definition: DBwriter.cxx:316
GB_ERROR GB_write_byte(GBDATA *gbd, int i)
Definition: arbdb.cxx:1238
static GBDATA * DB_create_container(GBDATA *parent, const char *name, bool mark)
Definition: DBwriter.cxx:47
typedef SmartCustomPtr(GEN_position, GEN_free_position) GEN_positionPtr
static ARB_init_perl_interface init
Definition: ARB_ext.c:101
void add_name(const char *name)
Definition: AW_rename.hxx:58
size_t * start_pos
Definition: adGene.h:39
NOT4PERL void GB_dump(GBDATA *gbd)
Definition: adTest.cxx:187
#define NULp
Definition: cxxforward.h:116
static GBDATA * DB_create_string_field(GBDATA *parent, const char *field, const char *content)
Definition: DBwriter.cxx:57
const char * getSequence() const
std::set< std::string > stringSet
Definition: types.h:28
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
const Location & getLocation() const
Definition: Feature.h:31
bool hasType(const char *type) const
Definition: DBwriter.cxx:297
GEN_position * GEN_read_position(GBDATA *gb_gene)
Definition: adGene.cxx:250
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
GB_ERROR GEN_testAndRemoveTranslations(GBDATA *gb_gene_data, void(*warn)(AW_CL cd, const char *msg), AW_CL cd, AW_repeated_question *ok_to_ignore_wrong_start_codon)
const stringMap & getEntries() const
Definition: MetaInfo.h:70
const char * getType() const
Definition: DBwriter.cxx:292
std::string tagged_content(const std::string &refkey) const
Definition: MetaInfo.cxx:61
bool operator<(const GEN_positionPtr &A, const GEN_positionPtr &B)
Definition: DBwriter.cxx:250