ARB
NT_dbrepair.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : NT_dbrepair.cxx //
4 // Purpose : repair database bugs //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in May 2008 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "NT_local.h"
13 
14 #include <arbdbt.h>
15 #include <adGene.h>
16 
17 #include <items.h>
18 #include <GEN.hxx>
19 #include <EXP.hxx>
20 #include <aw_msg.hxx>
21 #include <arb_progress.h>
22 #include <aw_question.hxx>
23 
24 #include <arb_str.h>
25 #include <arb_strarray.h>
26 
27 #include <map>
28 #include <set>
29 #include <string>
30 #include <vector>
31 #include <ad_colorset.h>
32 
33 using namespace std;
34 
35 #if defined(WARN_TODO)
36 #warning the whole fix mechanism should be part of some lower-level-library
37 // meanwhile DB checks are only performed by ARB_NTREE
38 // ItemSelector should go to same library as this module
39 #endif
40 
41 // --------------------------------------------------------------------------------
42 // CheckedConsistencies provides an easy way to automatically correct flaws in the database
43 // by calling a check routine exactly once.
44 //
45 // For an example see nt_check_database_consistency()
46 //
47 // Note: this causes problems if the DB is loaded with an older ARB version and some
48 // already fixed flaws are put into the DB again.
49 // see http://bugs.arb-home.de/ticket/143
50 
51 typedef GB_ERROR (*item_check_fun)(GBDATA *gb_item, ItemSelector& sel);
52 
53 typedef map<string, item_check_fun> item_check_map;
54 typedef item_check_map::const_iterator item_check_iter;
55 
57  GBDATA *gb_main;
58  size_t species_count;
59  size_t sai_count;
60  set<string> consistencies;
61  item_check_map item_checks;
62 
63  GB_ERROR perform_selected_item_checks(ItemSelector& sel);
64 
65 public:
66 
67  CheckedConsistencies(GBDATA *gb_main_) : gb_main(gb_main_) {
68  GB_transaction ta(gb_main);
69  GBDATA *gb_checks = GB_search(gb_main, "checks", GB_CREATE_CONTAINER);
70 
71  for (GBDATA *gb_check = GB_entry(gb_checks, "check"); gb_check; gb_check = GB_nextEntry(gb_check)) {
72  consistencies.insert(GB_read_char_pntr(gb_check));
73  }
74 
75  species_count = GBT_get_species_count(gb_main);
76  sai_count = GBT_get_SAI_count(gb_main);
77  }
78 
79  bool was_performed(const string& check_name) const {
80  return consistencies.find(check_name) != consistencies.end();
81  }
82 
83  GB_ERROR register_as_performed(const string& check_name) {
85  if (was_performed(check_name)) {
86  printf("check '%s' already has been registered before. Duplicated check name?\n", check_name.c_str());
87  }
88  else {
89  GB_transaction ta(gb_main);
90 
91  GBDATA *gb_checks = GB_search(gb_main, "checks", GB_CREATE_CONTAINER);
92  GBDATA *gb_check = GB_create(gb_checks, "check", GB_STRING);
93 
94  if (!gb_check) error = GB_await_error();
95  else error = GB_write_string(gb_check, check_name.c_str());
96 
97  if (!error) consistencies.insert(check_name);
98  }
99  return error;
100  }
101 
102  void perform_check(const string& check_name,
103  GB_ERROR (*do_check)(GBDATA *gb_main, size_t species, size_t sais),
104  GB_ERROR& error)
105  {
106  if (!error && !was_performed(check_name)) {
107  arb_progress progress(check_name.c_str());
108  error = do_check(gb_main, species_count, sai_count);
109  if (!error) register_as_performed(check_name);
110  }
111  }
112 
113  void register_item_check(const string& check_name, item_check_fun item_check) {
114  if (!was_performed(check_name)) {
115  item_checks[check_name] = item_check;
116  }
117  }
118 
119  void perform_item_checks(GB_ERROR& error);
120 
122  GB_ERROR error = NULp;
123  GB_transaction ta(gb_main);
124 
125  GBDATA *gb_checks = GB_search(gb_main, "checks", GB_CREATE_CONTAINER);
126  for (GBDATA *gb_check = GB_entry(gb_checks, "check"); gb_check && !error; gb_check = GB_nextEntry(gb_check)) {
127  char *check_name = GB_read_string(gb_check);
128 
129 #if defined(DEBUG)
130  printf("Deleting check '%s'\n", check_name);
131 #endif // DEBUG
132  error = GB_delete(gb_check);
133  consistencies.erase(check_name);
134  free(check_name);
135  }
136  return error;
137  }
138 };
139 
140 GB_ERROR CheckedConsistencies::perform_selected_item_checks(ItemSelector& sel) {
141  GB_ERROR error = NULp;
142  item_check_iter end = item_checks.end();
143 
144  for (GBDATA *gb_cont = sel.get_first_item_container(gb_main, NULp, QUERY_ALL_ITEMS);
145  gb_cont && !error;
146  gb_cont = sel.get_next_item_container(gb_cont, QUERY_ALL_ITEMS))
147  {
148  for (GBDATA *gb_item = sel.get_first_item(gb_cont, QUERY_ALL_ITEMS);
149  gb_item && !error;
150  gb_item = sel.get_next_item(gb_item, QUERY_ALL_ITEMS))
151  {
152  for (item_check_iter chk = item_checks.begin(); chk != end && !error; ++chk) {
153  error = chk->second(gb_item, sel);
154  }
155  }
156  }
157 
158  return error;
159 }
160 
162  if (!item_checks.empty()) {
163  if (!error) {
164  GB_transaction ta(gb_main);
165  bool is_genome_db = GEN_is_genome_db(gb_main, -1);
166 
167  error = perform_selected_item_checks(SPECIES_get_selector());
168  if (!error && is_genome_db) {
169  error = perform_selected_item_checks(GEN_get_selector());
170  if (!error) error = perform_selected_item_checks(EXP_get_selector());
171  }
172 
173  error = ta.close(error);
174  }
175 
176  if (!error) {
177  item_check_iter end = item_checks.end();
178  for (item_check_iter chk = item_checks.begin(); chk != end && !error; ++chk) {
179  error = register_as_performed(chk->first);
180  }
181 
182  if (!error) item_checks.clear();
183  }
184  }
185 }
186 
187 // --------------------------------------------------------------------------------
188 
189 static GB_ERROR NT_fix_gene_data(GBDATA *gb_main, size_t species_count, size_t /* sai_count */) {
190  GB_transaction ta(gb_main);
191  arb_progress progress(species_count);
192 
193  size_t deleted_gene_datas = 0;
194  size_t generated_gene_datas = 0;
195  GB_ERROR error = NULp;
196 
197  for (GBDATA *gb_species = GBT_first_species(gb_main);
198  gb_species && !error;
199  gb_species = GBT_next_species(gb_species))
200  {
201  bool is_organism = (GB_entry(gb_species, GENOM_ALIGNMENT)); // same test as GEN_is_organism, but w/o genome-db-assertion
202  GBDATA *gb_gene_data = GEN_find_gene_data(gb_species);
203 
204  if (is_organism && !gb_gene_data) {
205  gb_gene_data = GEN_findOrCreate_gene_data(gb_species); // @@@ check result & handle error
206  generated_gene_datas++;
207  }
208  else if (!is_organism && gb_gene_data) {
209  GBDATA *gb_child = GB_child(gb_gene_data);
210  if (!gb_child) {
211  error = GB_delete(gb_gene_data);
212  if (!error) deleted_gene_datas++;
213  }
214  else {
215  error = GBS_global_string("Non-empty 'gene_data' found for species '%s',\n"
216  "which has no alignment '" GENOM_ALIGNMENT "',\n"
217  "i.e. which is not regarded as full-genome organism.\n"
218  "This causes problems - please fix!",
219  GBT_get_name_or_description(gb_species));
220  }
221  }
222 
223  progress.inc_and_check_user_abort(error);
224  }
225 
226  if (!error) {
227  if (deleted_gene_datas) {
228  aw_message(GBS_global_string("Deleted %zu useless empty 'gene_data' entries.", deleted_gene_datas));
229  }
230  if (generated_gene_datas) {
231  aw_message(GBS_global_string("Re-created %zu missing 'gene_data' entries.\nThese organisms have no genes yet!", generated_gene_datas));
232  }
233  }
234  return ta.close(error);
235 }
236 
237 // --------------------------------------------------------------------------------
238 
239 static GBDATA *expectField(GBDATA *gb_gene, const char *field, GB_ERROR& data_error) {
240  GBDATA *gb_field = NULp;
241  if (!data_error) {
242  gb_field = GB_entry(gb_gene, field);
243  if (!gb_field) data_error = GBS_global_string("Expected field '%s' missing", field);
244  }
245  return gb_field;
246 }
247 
248 static GBDATA *disexpectField(GBDATA *gb_gene, const char *field, GB_ERROR& data_error) {
249  GBDATA *gb_field = NULp;
250  if (!data_error) {
251  gb_field = GB_entry(gb_gene, field);
252  if (gb_field) data_error = GBS_global_string("Unexpected field '%s' exists (wrong value in pos_joined?)", field);
253  }
254  GBS_reuse_buffer(field);
255  return gb_field;
256 }
257 
258 static GB_ERROR NT_convert_gene_locations(GBDATA *gb_main, size_t species_count, size_t /* sai_count */) {
259  GB_transaction ta(gb_main);
260 
261  GB_ERROR error = NULp;
262  long fixed_genes = 0;
263  long skipped_genes = 0;
264  long genes = 0;
265 
266  typedef vector<GBDATA*> GBvec;
267  GBvec toDelete;
268 
269  arb_progress progress(species_count);
270 
271  for (GBDATA *gb_organism = GEN_first_organism(gb_main);
272  gb_organism && !error;
273  gb_organism = GEN_next_organism(gb_organism))
274  {
275  GBDATA *gb_gene_data = GEN_find_gene_data(gb_organism);
276  nt_assert(gb_gene_data);
277  if (gb_gene_data) {
278  for (GBDATA *gb_gene = GEN_first_gene_rel_gene_data(gb_gene_data);
279  gb_gene && !error;
280  gb_gene = GEN_next_gene(gb_gene))
281  {
282  genes++;
283 
284  int parts = 1;
285  {
286  GBDATA *gb_pos_joined = GB_entry(gb_gene, "pos_joined");
287  if (gb_pos_joined) parts = GB_read_int(gb_pos_joined); // its a joined gene
288  }
289 
290  GBDATA *gb_pos_start = GB_entry(gb_gene, "pos_start"); // test for new format
291  if (!gb_pos_start) {
292  GBDATA *gb_pos_begin = GB_entry(gb_gene, "pos_begin"); // test for old format
293  if (!gb_pos_begin) {
294  error = "Neither 'pos_begin' nor 'pos_start' found - format of gene location is unknown";
295  }
296  }
297 
298  if (!gb_pos_start && !error) { // assume old format
299  // parts<-1 would be valid in new format, but here we have old format
300  if (parts<1) error = GBS_global_string("Illegal value in 'pos_joined' (%i)", parts);
301 
302  GB_ERROR data_error = NULp; // error in this gene -> don't convert
303  GEN_position *pos = GEN_new_position(parts, false); // all were joinable (no information about it was stored)
304 
305  // parse old gene information into 'pos'
306  //
307  // old-format was:
308  // Start-Positions: pos_begin, pos_begin2, pos_begin3, ...
309  // End-Positions: pos_end, pos_end2, pos_end3, ...
310  // Joined?: pos_joined (always >= 1)
311  // Complement: complement (one entry for all parts)
312  // Certainty: pos_uncertain (maybe pos_uncertain1 etc.)
313 
314  int complement = 0;
315  {
316  GBDATA *gb_complement = GB_entry(gb_gene, "complement");
317  if (gb_complement) {
318  complement = GB_read_byte(gb_complement);
319  toDelete.push_back(gb_complement);
320  }
321  }
322 
323  bool has_uncertain_fields = false;
324  for (int p = 1; p <= parts && !error && !data_error; ++p) {
325  GBDATA *gb_pos_begin = NULp;
326  GBDATA *gb_pos_end = NULp;
327  const char *pos_uncertain_field = NULp;
328 
329  if (p == 1) {
330  gb_pos_begin = expectField(gb_gene, "pos_begin", data_error);
331  gb_pos_end = expectField(gb_gene, "pos_end", data_error);
332 
333  pos_uncertain_field = "pos_uncertain";
334  }
335  else {
336  const char *pos_begin_field = GBS_global_string("pos_begin%i", p);
337  const char *pos_end_field = GBS_global_string("pos_end%i", p);
338 
339  gb_pos_begin = expectField(gb_gene, pos_begin_field, data_error);
340  gb_pos_end = expectField(gb_gene, pos_end_field, data_error);
341 
342  GBS_reuse_buffer(pos_end_field);
343  GBS_reuse_buffer(pos_begin_field);
344 
345  if (!data_error) pos_uncertain_field = GBS_global_string("pos_uncertain%i", p);
346  }
347 
348  int pospos = complement ? (parts-p) : (p-1);
349 
350  if (!data_error) {
351  GBDATA *gb_pos_uncertain = GB_entry(gb_gene, pos_uncertain_field);
352 
353  if (!gb_pos_uncertain) {
354  if (has_uncertain_fields) data_error = GBS_global_string("Expected field '%s' missing", pos_uncertain_field);
355  }
356  else {
357  if (p == 1) has_uncertain_fields = true;
358  else {
359  if (!has_uncertain_fields) {
360  data_error = GBS_global_string("Found '%s' as first certainty-information", pos_uncertain_field);
361  }
362  }
363  }
364 
365  if (!data_error) {
366  int begin = GB_read_int(gb_pos_begin);
367  int end = GB_read_int(gb_pos_end);
368 
369  pos->start_pos[pospos] = begin;
370  pos->stop_pos[pospos] = end;
371  pos->complement[pospos] = complement; // set all complement entries to same value (old format only had one complement entry)
372 
373  if (gb_pos_uncertain) {
374  const char *uncertain = GB_read_char_pntr(gb_pos_uncertain);
375 
376  if (!uncertain) error = GB_await_error();
377  else {
378  if (!pos->start_uncertain) GEN_use_uncertainties(pos);
379 
380  if (strlen(uncertain) != 2) {
381  data_error = "wrong length";
382  }
383  else {
384  for (int up = 0; up<2; up++) {
385  if (!strchr("<=>", uncertain[up])) {
386  data_error = GBS_global_string("illegal character '%c'", uncertain[up]);
387  }
388  else {
389  (up == 0 ? pos->start_uncertain[pospos] : pos->stop_uncertain[pospos]) = uncertain[up];
390  }
391  }
392  }
393 
394 
395  toDelete.push_back(gb_pos_uncertain);
396  }
397  }
398 
399  toDelete.push_back(gb_pos_begin);
400  toDelete.push_back(gb_pos_end);
401  }
402  }
403  }
404 
405  for (int p = parts+1; p <= parts+4 && !error && !data_error; ++p) {
406  disexpectField(gb_gene, GBS_global_string("pos_begin%i", p), data_error);
407  disexpectField(gb_gene, GBS_global_string("pos_end%i", p), data_error);
408  disexpectField(gb_gene, GBS_global_string("complement%i", p), data_error);
409  disexpectField(gb_gene, GBS_global_string("pos_uncertain%i", p), data_error);
410  }
411 
412  // now save new position data
413 
414  if (data_error) {
415  skipped_genes++;
416  }
417  else if (!error) {
418  error = GEN_write_position(gb_gene, pos, 0);
419 
420  if (!error) {
421  // delete old-format entries
422  GBvec::const_iterator end = toDelete.end();
423  for (GBvec::const_iterator i = toDelete.begin(); i != end && !error; ++i) {
424  GBDATA *gb_del = *i;
425  error = GB_delete(gb_del);
426  }
427 
428  if (!error) fixed_genes++;
429  }
430  }
431 
432  toDelete.clear();
433  GEN_free_position(pos);
434 
435  if (data_error || error) {
436  char *gene_id = GEN_global_gene_identifier(gb_gene, gb_organism);
437  if (error) {
438  error = GBS_global_string("Gene '%s': %s", gene_id, error);
439  }
440  else {
441  aw_message(GBS_global_string("Gene '%s' was not converted, fix data manually!\nReason: %s", gene_id, data_error));
442  }
443  free(gene_id);
444  }
445  }
446  }
447  }
448 
449  progress.inc_and_check_user_abort(error);
450  }
451 
452  if (!error) {
453  if (fixed_genes>0) aw_message(GBS_global_string("Fixed location entries of %li genes.", fixed_genes));
454  if (skipped_genes>0) {
455  aw_message(GBS_global_string("Didn't fix location entries of %li genes (see warnings).", skipped_genes));
456  error = "Not all gene locations were fixed.\nFix manually, save DB and restart ARB with that DB.\nMake sure you have a backup copy of the original DB!";
457  }
458 
459  if (fixed_genes || skipped_genes) {
460  long already_fixed_genes = genes-(fixed_genes+skipped_genes);
461  if (already_fixed_genes>0) aw_message(GBS_global_string("Location entries of %li genes already were in new format.", already_fixed_genes));
462  }
463  }
464 
465  return error;
466 }
467 
468 
469 // --------------------------------------------------------------------------------
470 
471 static GB_ERROR NT_del_mark_move_REF(GBDATA *gb_main, size_t species_count, size_t sai_count) {
472  GB_transaction ta(gb_main);
473  GB_ERROR error = NULp;
474  size_t all = species_count+sai_count;
475  size_t removed = 0;
476 
477  // delete 'mark' entries from all alignments of species/SAIs
478 
479  arb_progress progress(all);
480  ConstStrArray ali_names;
481  GBT_get_alignment_names(ali_names, gb_main);
482 
483  for (int pass = 0; pass < 2 && !error; ++pass) {
484  for (GBDATA *gb_item = (pass == 0) ? GBT_first_species(gb_main) : GBT_first_SAI(gb_main);
485  gb_item && !error;
486  gb_item = (pass == 0) ? GBT_next_species(gb_item) : GBT_next_SAI(gb_item))
487  {
488  for (int ali = 0; ali_names[ali] && !error; ++ali) {
489  GBDATA *gb_ali = GB_entry(gb_item, ali_names[ali]);
490  if (gb_ali) {
491  GBDATA *gb_mark = GB_entry(gb_ali, "mark");
492  if (gb_mark) {
493  error = GB_delete(gb_mark);
494  removed++;
495  }
496  }
497  }
498 
499  progress.inc_and_check_user_abort(error);
500  }
501  }
502 
503  {
504  char *helix_name = GBT_get_default_helix(gb_main);
505  GBDATA *gb_helix = GBT_find_SAI(gb_main, helix_name);
506 
507  if (gb_helix) {
508  for (int ali = 0; ali_names[ali] && !error; ++ali) {
509  GBDATA *gb_ali = GB_entry(gb_helix, ali_names[ali]);
510  GBDATA *gb_old_ref = GB_entry(gb_ali, "REF");
511  GBDATA *gb_new_ref = GB_entry(gb_ali, "_REF");
512 
513  if (gb_old_ref) {
514  if (gb_new_ref) {
515  error = GBS_global_string("SAI:%s has 'REF' and '_REF' in '%s' (data corrupt?!)",
516  helix_name, ali_names[ali]);
517  }
518  else { // move info from REF -> _REF
519  char *content = GB_read_string(gb_old_ref);
520  if (!content) error = GB_await_error();
521  else {
522  gb_new_ref = GB_create(gb_ali, "_REF", GB_STRING);
523  if (!gb_new_ref) error = GB_await_error();
524  else {
525  error = GB_write_string(gb_new_ref, content);
526  if (!error) error = GB_delete(gb_old_ref);
527  }
528  free(content);
529  }
530  }
531  }
532  }
533  }
534 
535  free(helix_name);
536  }
537 
538  if (!error) {
539  if (removed) {
540  aw_message(GBS_global_string("Deleted %zu useless 'mark' entries.", removed));
541  }
542  }
543 
544  return ta.close(error);
545 }
546 
547 // --------------------------------------------------------------------------------
548 
549 static bool testDictionaryCompression(GBDATA *gbd, GBQUARK key_quark, bool testUse) {
550  // returns true, if
551  // testUse == true and ANY entries below 'gbd' with quark 'key_quark' uses dictionary compression
552  // testUse == false and ALL entries below 'gbd' with quark 'key_quark' can be decompressed w/o errors
553 
554  nt_assert(GB_read_type(gbd) == GB_DB);
555 
556  for (GBDATA *gb_sub = GB_child(gbd); gb_sub; gb_sub = GB_nextChild(gb_sub)) {
557  switch (GB_read_type(gb_sub)) {
558  case GB_DB:
559  // return false if any compression failed or return true if any uses dict-compression
560  if (testDictionaryCompression(gb_sub, key_quark, testUse) == testUse) return testUse;
561  break;
562 
563  case GB_STRING:
564  if (GB_get_quark(gb_sub) == key_quark && GB_is_dictionary_compressed(gb_sub)) {
565  if (testUse) return true;
566 
567  const char *decompressed = GB_read_char_pntr(gb_sub);
568  if (!decompressed) return false;
569  }
570  break;
571 
572  default:
573  break;
574  }
575  }
576 
577  return !testUse;
578 }
579 
580 class Dict;
581 typedef SmartPtr<Dict> DictPtr;
582 
583 
584 class KeyInfo : virtual Noncopyable {
585  string name; // keyname
586  DictPtr original;
587 
588  bool compressionTested;
589  bool compressed;
590 
591  void init() {
592  compressionTested = false;
593  compressed = false;
594  }
595 
596 public:
597  KeyInfo(const char *Name) : name(Name) { init(); }
598  KeyInfo(const char *Name, DictPtr originalDict) : name(Name), original(originalDict) { init(); }
599 
600  void testCompressed(GBDATA *gb_main) {
601  nt_assert(!compressionTested);
602  compressed = testDictionaryCompression(gb_main, GB_find_or_create_quark(gb_main, name.c_str()), true);
603  compressionTested = true;
604  }
605 
606  const string& getName() const { return name; }
607 
608  bool isCompressed() const {
609  nt_assert(compressionTested);
610  return compressed;
611  }
612 };
613 
614 
615 class Dict : virtual Noncopyable {
616  string group; // lowercase keyname
617  string orgkey;
618  DictData *data;
619 
620  map<string, bool> decompressWorks; // key -> bool
621 
622 public:
623  static GBDATA *gb_main;
624 
625  Dict(const char *Group, const char *OrgKey, DictData *Data) : group(Group), orgkey(OrgKey), data(Data) {}
626 
627  const string& getGroup() const { return group; }
628  const string& getOriginalKey() const { return orgkey; }
629 
630  bool mayBeUsedWith(const string& key) const { return strcasecmp(group.c_str(), key.c_str()) == 0; }
631 
632  GB_ERROR assignToKey(const string& key) const { return GB_set_dictionary(gb_main, key.c_str(), data); }
633  GB_ERROR unassignFromKey(const string& key) const { return GB_set_dictionary(gb_main, key.c_str(), NULp); }
634 
635  bool canDecompress(const string& key) {
636  nt_assert(mayBeUsedWith(key));
637  if (decompressWorks.find(key) == decompressWorks.end()) {
638  bool works = false;
639  GB_ERROR error = assignToKey(key);
640 
641  if (!error) works = testDictionaryCompression(gb_main, GB_find_or_create_quark(gb_main, key.c_str()), false);
642  decompressWorks[key] = works;
643 
644  GB_ERROR err2 = unassignFromKey(key);
645  if (err2) {
646  aw_message(GBS_global_string("Error while removing @dictionary from key '%s': %s", key.c_str(), err2));
647  }
648  }
649  return decompressWorks[key];
650  }
651 };
653 
654 
655 typedef map<string, int> KeyCounter; // groupname -> occur count
657 typedef map<string, KeyInfoPtr> Keys; // keyname -> info
658 typedef map<string, DictPtr> DictMap;
659 typedef vector<DictPtr> Dicts;
660 typedef set<string> StringSet;
661 
662 #define STATUS_PREFIX "Dictionary: "
663 
// Tells whether the associative container 'container' holds 'key'.
template<typename CONT, typename KEY>
bool contains(const CONT& container, const KEY& key) {
    return container.count(key) != 0;
}
668 
669 static GB_ERROR findAffectedKeys(GBDATA *gb_key_data, KeyCounter& kcount, Keys& keys, Dicts& dicts) {
670  GB_ERROR error = NULp;
671  GBDATA *gb_main = GB_get_root(gb_key_data);
672 
673  for (int pass = 1; pass <= 2; ++pass) {
674  for (GBDATA *gb_key = GB_entry(gb_key_data, "@key"); !error && gb_key; gb_key = GB_nextEntry(gb_key)) {
675  GBDATA *gb_name = GB_entry(gb_key, "@name");
676  const char *keyName = GB_read_char_pntr(gb_name);
677 
678  if (!keyName) {
679  error = GBS_global_string("@key w/o @name (%s)", GB_await_error());
680  }
681  else {
682  char *keyGroup = ARB_strdup(keyName);
683  ARB_strlower(keyGroup);
684 
685  switch (pass) {
686  case 1:
687  kcount[keyGroup]++;
688  break;
689  case 2:
690  if (kcount[keyGroup]>1) {
691  GBDATA *gb_dictionary = GB_entry(gb_key, "@dictionary");
692  if (gb_dictionary) {
693  DictPtr dict = new Dict(keyGroup, keyName, GB_get_dictionary(gb_main, keyName));
694  keys[keyName] = new KeyInfo(keyName, dict);
695  dicts.push_back(dict);
696  }
697  else keys[keyName] = new KeyInfo(keyName);
698  }
699  else kcount.erase(keyGroup);
700  break;
701  }
702  free(keyGroup);
703  }
704  }
705  }
706  return error;
707 }
708 
709 static GB_ERROR deleteDataOfKey(GBDATA *gbd, GBQUARK key_quark, StringSet& deletedData, long& deleted, long& notDeleted) {
710  GB_ERROR error = NULp;
711  for (GBDATA *gb_sub = GB_child(gbd); gb_sub; gb_sub = GB_nextChild(gb_sub)) {
712  switch (GB_read_type(gb_sub)) {
713  case GB_DB:
714  error = deleteDataOfKey(gb_sub, key_quark, deletedData, deleted, notDeleted);
715  break;
716 
717  case GB_STRING:
718  if (GB_get_quark(gb_sub) == key_quark) {
719  if (GB_is_dictionary_compressed(gb_sub)) {
720  string path(GB_get_db_path(gb_sub));
721  error = GB_delete(gb_sub);
722  if (!error) {
723  deletedData.insert(path);
724  deleted++;
725  }
726  }
727  else {
728  notDeleted++;
729  }
730  }
731  break;
732  default:
733  break;
734  }
735  }
736  return error;
737 }
738 
739 static char *readFirstCompressedDataOf(GBDATA *gbd, GBQUARK key_quark) {
740  char *data = NULp;
741  for (GBDATA *gb_sub = GB_child(gbd); !data && gb_sub; gb_sub = GB_nextChild(gb_sub)) {
742  switch (GB_read_type(gb_sub)) {
743  case GB_DB:
744  data = readFirstCompressedDataOf(gb_sub, key_quark);
745  break;
746 
747  case GB_STRING:
748  if (GB_get_quark(gb_sub) == key_quark) {
749  if (GB_is_dictionary_compressed(gb_sub)) {
750  data = GB_read_as_string(gb_sub);
751  }
752  }
753  break;
754  default:
755  break;
756  }
757  }
758  return data;
759 }
760 
761 
762 static GB_ERROR NT_fix_dict_compress(GBDATA *gb_main, size_t, size_t) {
763  GB_transaction ta(gb_main);
764  GBDATA *gb_key_data = GB_search(gb_main, GB_SYSTEM_FOLDER "/" GB_SYSTEM_KEY_DATA, GB_FIND);
765  GB_ERROR error = NULp;
766 
767  Dict::gb_main = gb_main;
768 
769  if (!gb_key_data) {
770  error = "No " GB_SYSTEM_KEY_DATA " found.. DB corrupted?";
771  }
772  else {
773  KeyCounter kcount; // strlwr(keyname) -> count
774  Keys keys;
775  Dicts dicts;
776 
777  error = findAffectedKeys(gb_key_data, kcount, keys, dicts);
778 
779  // count affected keys
780  long affectedKeys = 0;
781  for (KeyCounter::iterator kci = kcount.begin(); kci != kcount.end(); ++kci) {
782  affectedKeys += kci->second;
783  }
784 
785  if (!error && affectedKeys>0) {
786  // check which keys are compressed
787 
788  {
789  arb_progress progress(STATUS_PREFIX "search compressed data", affectedKeys);
790 
791  for (Keys::iterator ki = keys.begin(); ki != keys.end(); ++ki) {
792  KeyInfoPtr k = ki->second;
793  k->testCompressed(gb_main);
794  ++progress;
795  }
796  }
797 
798  // test which key/dict combinations work
799  long combinations = 0; // possible key/dict combinations
800 
801  DictMap use; // keyname -> dictionary (which dictionary to use)
802  StringSet multiDecompressible; // keys which can be decompressed with multiple dictionaries
803 
804  for (int pass = 1; pass <= 2; ++pass) {
805  arb_progress *progress = NULp;
806  if (pass == 2 && combinations) progress = new arb_progress(STATUS_PREFIX "test compression", combinations);
807 
808  for (Dicts::iterator di = dicts.begin(); di != dicts.end(); ++di) {
809  DictPtr d = *di;
810 
811  for (Keys::iterator ki = keys.begin(); ki != keys.end(); ++ki) {
812  KeyInfoPtr k = ki->second;
813  const string& keyname = k->getName();
814 
815  if (k->isCompressed() && d->mayBeUsedWith(keyname)) {
816  switch (pass) {
817  case 1:
818  combinations++;
819  break;
820  case 2:
821  if (d->canDecompress(keyname)) {
822  if (!contains(use, keyname)) { // first dictionary working with keyname
823  use[keyname] = d;
824  }
825  else { // already have another dictionary working with keyname
826  multiDecompressible.insert(keyname);
827  }
828  }
829  ++(*progress);
830  break;
831  }
832  }
833  }
834  }
835  delete progress;
836  }
837 
838  StringSet notDecompressible; // keys which can be decompressed with none of the dictionaries
839  for (Keys::iterator ki = keys.begin(); ki != keys.end(); ++ki) {
840  KeyInfoPtr k = ki->second;
841  const string& keyname = k->getName();
842 
843  if (k->isCompressed()) {
844  if (!contains(use, keyname)) notDecompressible.insert(keyname);
845  if (contains(multiDecompressible, keyname)) use.erase(keyname);
846  }
847  }
848 
849  bool dataLost = false;
850  int reassigned = 0;
851 
852  if (!notDecompressible.empty()) {
853  // bad .. found undecompressible data
854  long nd_count = notDecompressible.size();
855  aw_message(GBS_global_string("Detected corrupted dictionary compression\n"
856  "Data of %li DB-keys is lost and will be deleted", nd_count));
857 
858  arb_progress progress(STATUS_PREFIX "deleting corrupt data", nd_count);
859 
860  StringSet deletedData;
861  long deleted = 0;
862  long notDeleted = 0;
863 
864  for (StringSet::iterator ki = notDecompressible.begin(); !error && ki != notDecompressible.end(); ++ki) {
865  const string& keyname = *ki;
866 
867  error = deleteDataOfKey(gb_main, GB_find_or_create_quark(gb_main, keyname.c_str()), deletedData, deleted, notDeleted);
868  ++progress;
869  }
870 
871  if (!error) {
872  nt_assert(deleted); // at least 1 db-entry should have been deleted
873 
874  aw_message(GBS_global_string("Deleted %li of %li affected DB-entries", deleted, deleted+notDeleted));
875  aw_message("see console for a list of affected keys");
876 
877  printf("Deleted keys:\n");
878  for (StringSet::iterator di = deletedData.begin(); di != deletedData.end(); ++di) {
879  printf("* %s\n", di->c_str());
880  }
881  }
882  }
883 
884  if (!error && !multiDecompressible.empty()) {
885  for (StringSet::iterator ki = multiDecompressible.begin(); !error && ki != multiDecompressible.end(); ++ki) {
886  const string& keyname = *ki;
887  int possible = 0;
888  vector<DictPtr> possibleDicts;
889 
890  printf("--------------------------------------------------------------------------------\n");
891 
892  for (Dicts::iterator di = dicts.begin(); !error && di != dicts.end(); ++di) {
893  DictPtr d = *di;
894  if (d->mayBeUsedWith(keyname) && d->canDecompress(keyname)) {
895  error = d->assignToKey(keyname);
896  if (!error) {
897  char *data = readFirstCompressedDataOf(gb_main, GB_find_or_create_quark(gb_main, keyname.c_str()));
898 
899  nt_assert(data);
900  possible++;
901  printf("possibility %i = '%s'\n", possible, data);
902  free(data);
903 
904  possibleDicts.push_back(d);
905 
906  error = d->unassignFromKey(keyname);
907  }
908  }
909  }
910 
911  if (!error) {
912  nt_assert(possible>0);
913 
914  int selected;
915  if (possible>1) {
916  char *question = GBS_global_string_copy("%i possibilities to decompress field '%s' have been detected\n"
917  "and example data was dumped to the console.\n"
918  "Please examine output and decide which is the correct possibility!",
919  possible, keyname.c_str());
920 
921  const char *buttons = "Abort";
922  for (int p = 1; p <= possible; ++p) buttons = GBS_global_string("%s,%i", buttons, p);
923  selected = aw_question("dict_decompress_bug", question, buttons, false, NULp);
924  free(question);
925  }
926  else {
927  selected = 1;
928  }
929 
930  if (!selected) {
931  error = "Aborted by user";
932  }
933  else {
934  use[keyname] = possibleDicts[selected-1];
935  }
936  }
937  }
938  }
939 
940  // now all redundancies should be eliminated and we can assign dictionaries to affected keys
941  if (!error) {
942  for (Keys::iterator ki = keys.begin(); !error && ki != keys.end(); ++ki) {
943  KeyInfoPtr k = ki->second;
944  const string& keyname = k->getName();
945 
946  if (k->isCompressed()) {
947  if (!contains(use, keyname)) {
948  error = GBS_global_string("No dictionary detected for key '%s'", keyname.c_str());
949  }
950  else {
951  DictPtr d = use[keyname];
952 
953  if (d->getOriginalKey() != keyname) {
954  d->assignToKey(keyname); // set the dictionary
955  aw_message(GBS_global_string("Assigning '%s'-dictionary to '%s'",
956  d->getOriginalKey().c_str(), keyname.c_str()));
957  reassigned++;
958  }
959  }
960  }
961  }
962  }
963 
964  if (dataLost||reassigned) {
965  aw_message(dataLost
966  ? "We apologize for the data-loss."
967  : "No conflicts detected in compressed data.");
968  aw_message("Dictionaries fixed.\n"
969  "Please save your database with a new name.");
970  }
971  }
972  }
973 
974  Dict::gb_main = NULp;
975  return ta.close(error);
976 }
977 
978 // --------------------------------------------------------------------------------
979 
981  // Databases out there may contain multiple 'ARB_color' entries.
982  // Due to some already fixed bug - maybe introduced in r5309 and fixed in r5825
983 
984  GBDATA *gb_color = GB_entry(gb_item, GB_COLORGROUP_ENTRY);
985  GB_ERROR error = NULp;
986 
987 #if defined(DEBUG)
988  int del_count = 0;
989 #endif // DEBUG
990 
991  if (gb_color) {
992  GB_topSecurityLevel unsecured(gb_color);
993  while (!error) {
994  GBDATA *gb_next_color = GB_nextEntry(gb_color);
995  if (!gb_next_color) break;
996 
997  error = GB_delete(gb_next_color);
998 #if defined(DEBUG)
999  if (!error) del_count++;
1000 #endif // DEBUG
1001  }
1002  }
1003 
1004 #if defined(DEBUG)
1005  if (del_count) fprintf(stderr,
1006  "- deleted %i duplicated '" GB_COLORGROUP_ENTRY "' from %s '%s'\n",
1007  del_count,
1008  sel.item_name,
1009  sel.generate_item_id(GB_get_root(gb_item), gb_item));
1010 #endif // DEBUG
1011 
1012  return error;
1013 }
1014 
1015 // --------------------------------------------------------------------------------
1016 
1018  // status is already open and will be closed by caller!
1019 
1020  CheckedConsistencies check(gb_main);
1021  GB_ERROR err = NULp;
1022  bool is_genome_db;
1023  {
1024  GB_transaction ta(gb_main);
1025  is_genome_db = GEN_is_genome_db(gb_main, -1);
1026  }
1027 
1028  check.perform_check("fix gene_data", NT_fix_gene_data, err);
1029  check.perform_check("fix_dict_compress", NT_fix_dict_compress, err); // do this before NT_del_mark_move_REF (cause 'REF' is affected)
1030  check.perform_check("del_mark_move_REF", NT_del_mark_move_REF, err);
1031 
1032  if (is_genome_db) {
1033  check.perform_check("convert_gene_locations", NT_convert_gene_locations, err);
1034  }
1035 
1036  check.register_item_check("duplicated_item_colors", remove_dup_colors);
1037  check.perform_item_checks(err);
1038 
1039  return err;
1040 }
1041 
1042 void NT_rerepair_DB(AW_window*, GBDATA *gb_main) {
1043  // re-perform all DB checks
1044  GB_ERROR err = NULp;
1045  {
1046  CheckedConsistencies check(gb_main);
1047  err = check.forgetDoneChecks();
1048  }
1049  if (!err) {
1050  arb_progress progress("DB-Repair");
1051  err = NT_repair_DB(gb_main);
1052  }
1053 
1054  if (err) aw_message(err);
1055 }
1056 
1057 
ItemSelector & GEN_get_selector()
const char * GB_ERROR
Definition: arb_core.h:25
#define GB_SYSTEM_FOLDER
Definition: arbdb.h:27
GBDATA * GBT_first_SAI(GBDATA *gb_main)
Definition: aditem.cxx:162
const char * item_name
Definition: items.h:65
void testCompressed(GBDATA *gb_main)
unsigned char * complement
Definition: adGene.h:41
group_matcher all()
Definition: test_unit.h:1000
GBDATA * GEN_next_gene(GBDATA *gb_gene)
Definition: adGene.cxx:138
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:723
bool canDecompress(const string &key)
GBDATA * GB_child(GBDATA *father)
Definition: adquery.cxx:322
KeyInfo(const char *Name, DictPtr originalDict)
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1385
void GEN_free_position(GEN_position *pos)
Definition: adGene.cxx:195
static GBDATA * gb_main
static GB_ERROR NT_convert_gene_locations(GBDATA *gb_main, size_t species_count, size_t)
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
GB_ERROR assignToKey(const string &key) const
GBDATA * GEN_findOrCreate_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:44
long GBT_get_SAI_count(GBDATA *gb_main)
Definition: aditem.cxx:211
void GBT_get_alignment_names(ConstStrArray &names, GBDATA *gbd)
Definition: adali.cxx:316
static GB_ERROR deleteDataOfKey(GBDATA *gbd, GBQUARK key_quark, StringSet &deletedData, long &deleted, long &notDeleted)
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1054
static GBDATA * disexpectField(GBDATA *gb_gene, const char *field, GB_ERROR &data_error)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
STL namespace.
GB_ERROR unassignFromKey(const string &key) const
void GEN_use_uncertainties(GEN_position *pos)
Definition: adGene.cxx:184
static GBDATA * expectField(GBDATA *gb_gene, const char *field, GB_ERROR &data_error)
const string & getOriginalKey() const
set< string > StringSet
Dict(const char *Group, const char *OrgKey, DictData *Data)
void register_item_check(const string &check_name, item_check_fun item_check)
static GB_ERROR NT_fix_gene_data(GBDATA *gb_main, size_t species_count, size_t)
map< string, item_check_fun > item_check_map
Definition: NT_dbrepair.cxx:53
ItemSelector & EXP_get_selector()
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1904
static GB_ERROR remove_dup_colors(GBDATA *gb_item, ItemSelector &IF_DEBUG(sel))
GBDATA * GBT_find_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:177
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
Definition: arbdb.h:78
GB_TYPES GB_read_type(GBDATA *gbd)
Definition: arbdb.cxx:1641
SmartPtr< KeyInfo > KeyInfoPtr
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1779
bool isCompressed() const
KeyInfo(const char *Name)
GB_ERROR register_as_performed(const string &check_name)
Definition: NT_dbrepair.cxx:83
unsigned char * start_uncertain
Definition: adGene.h:53
static int group[MAXN+1]
Definition: ClustalV.cxx:65
void perform_check(const string &check_name, GB_ERROR(*do_check)(GBDATA *gb_main, size_t species, size_t sais), GB_ERROR &error)
size_t * stop_pos
Definition: adGene.h:40
map< string, KeyInfoPtr > Keys
GBDATA *(* get_first_item_container)(GBDATA *, AW_root *, QUERY_RANGE)
Definition: items.h:69
void NT_rerepair_DB(AW_window *, GBDATA *gb_main)
#define GB_COLORGROUP_ENTRY
Definition: ad_colorset.h:21
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * GB_get_root(GBDATA *gbd)
Definition: arbdb.cxx:1738
GBDATA *(* get_next_item_container)(GBDATA *, QUERY_RANGE)
Definition: items.h:70
#define STATUS_PREFIX
char * ARB_strlower(char *s)
Definition: arb_str.h:64
GBQUARK GB_find_or_create_quark(GBDATA *gbd, const char *key)
Definition: arbdb.cxx:1693
static GB_ERROR NT_fix_dict_compress(GBDATA *gb_main, size_t, size_t)
const string & getName() const
GBDATA * GEN_find_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:50
GB_ERROR GB_set_dictionary(GBDATA *gb_main, const char *key, const DictData *dd)
Definition: adsystem.cxx:295
GBDATA * GEN_next_organism(GBDATA *gb_organism)
Definition: adGene.cxx:755
const string & getGroup() const
GB_ERROR GEN_write_position(GBDATA *gb_gene, const GEN_position *pos, long seqLength)
Definition: adGene.cxx:325
GBQUARK GB_get_quark(GBDATA *gbd)
Definition: arbdb.cxx:1701
char *(* generate_item_id)(GBDATA *gb_main, GBDATA *gb_item)
Definition: items.h:58
Definition: arbdb.h:86
void GBS_reuse_buffer(const char *global_buffer)
Definition: arb_msg.cxx:510
GBDATA *(* get_first_item)(GBDATA *, QUERY_RANGE)
Definition: items.h:72
#define GENOM_ALIGNMENT
Definition: adGene.h:19
CheckedConsistencies(GBDATA *gb_main_)
Definition: NT_dbrepair.cxx:67
static char * readFirstCompressedDataOf(GBDATA *gbd, GBQUARK key_quark)
GB_ERROR(* item_check_fun)(GBDATA *gb_item, ItemSelector &sel)
Definition: NT_dbrepair.cxx:51
static GB_ERROR findAffectedKeys(GBDATA *gb_key_data, KeyCounter &kcount, Keys &keys, Dicts &dicts)
vector< DictPtr > Dicts
#define nt_assert(cond)
Definition: NT_local.h:27
map< string, DictPtr > DictMap
map< string, int > KeyCounter
bool was_performed(const string &check_name) const
Definition: NT_dbrepair.cxx:79
GEN_position * GEN_new_position(int parts, bool joinable)
Definition: adGene.cxx:155
GBDATA *(* get_next_item)(GBDATA *, QUERY_RANGE)
Definition: items.h:73
int aw_question(const char *unique_id, const char *question, const char *buttons, bool sameSizeButtons, const char *helpfile)
Definition: AW_question.cxx:26
GBDATA * GBT_next_SAI(GBDATA *gb_sai)
Definition: aditem.cxx:166
char * GEN_global_gene_identifier(GBDATA *gb_gene, GBDATA *gb_organism)
Definition: adGene.cxx:793
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:32
bool mayBeUsedWith(const string &key) const
int GB_read_byte(GBDATA *gbd)
Definition: arbdb.cxx:728
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:903
GB_ERROR forgetDoneChecks()
ItemSelector & SPECIES_get_selector()
Definition: species.cxx:139
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
static ARB_init_perl_interface init
Definition: ARB_ext.c:101
const char * GB_get_db_path(GBDATA *gbd)
Definition: adTest.cxx:14
void aw_message(const char *msg)
Definition: AW_status.cxx:932
size_t * start_pos
Definition: adGene.h:39
GB_ERROR NT_repair_DB(GBDATA *gb_main)
char * GBT_get_default_helix(GBDATA *)
Definition: adtools.cxx:64
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:97
#define GB_SYSTEM_KEY_DATA
Definition: arbdb.h:28
static GB_ERROR NT_del_mark_move_REF(GBDATA *gb_main, size_t species_count, size_t sai_count)
bool contains(const CONT &container, const KEY &key)
SmartPtr< Dict > DictPtr
GBDATA * GB_nextChild(GBDATA *child)
Definition: adquery.cxx:326
static bool testDictionaryCompression(GBDATA *gbd, GBQUARK key_quark, bool testUse)
bool GB_is_dictionary_compressed(GBDATA *gbd)
Definition: adcompr.cxx:869
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GBDATA * gb_main
Definition: adname.cxx:33
void perform_item_checks(GB_ERROR &error)
item_check_map::const_iterator item_check_iter
Definition: NT_dbrepair.cxx:54
#define IF_DEBUG(x)
Definition: arb_assert.h:303
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
GBDATA * GEN_first_organism(GBDATA *gb_main)
Definition: adGene.cxx:749
DictData * GB_get_dictionary(GBDATA *gb_main, const char *key)
Definition: adsystem.cxx:279
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:441
bool GEN_is_genome_db(GBDATA *gb_main, int default_value)
Definition: adGene.cxx:20
GBDATA * GEN_first_gene_rel_gene_data(GBDATA *gb_gene_data)
Definition: adGene.cxx:134
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:274
Definition: Group.hxx:20
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
unsigned char * stop_uncertain
Definition: adGene.h:54