ARB
NT_dbrepair.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : NT_dbrepair.cxx //
4 // Purpose : repair database bugs //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in May 2008 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "NT_local.h"
13 
14 #include <arbdbt.h>
15 #include <adGene.h>
16 
17 #include <items.h>
18 #include <GEN.hxx>
19 #include <EXP.hxx>
20 #include <aw_msg.hxx>
21 #include <arb_progress.h>
22 #include <aw_question.hxx>
23 
24 #include <arb_str.h>
25 #include <arb_strarray.h>
26 
27 #include <map>
28 #include <set>
29 #include <string>
30 #include <vector>
31 #include <ad_colorset.h>
32 
33 using namespace std;
34 
35 // @@@ the whole 'fix'-mechanism should be part of some lower-level-library
36 // meanwhile DB checks are only performed by ARB_NTREE
37 // ItemSelector should go to same library as this module
38 
39 // --------------------------------------------------------------------------------
40 // CheckedConsistencies provides an easy way to automatically correct flaws in the database
41 // by calling a check routine exactly once.
42 //
43 // For an example see nt_check_database_consistency()
44 //
45 // Note: this causes problems if the DB is loaded with an older ARB version and some already
46 // fixed flaws are put into the DB again.
47 // see http://bugs.arb-home.de/ticket/143
48 
// Signature of a check that is applied to one single item (selected via 'sel').
// Returns an error to stop checking, NULp otherwise.
49 typedef GB_ERROR (*item_check_fun)(GBDATA *gb_item, ItemSelector& sel);
50 
// check-name -> check-function (the name is what gets stored in the DB once the check has been done)
51 typedef map<string, item_check_fun> item_check_map;
52 typedef item_check_map::const_iterator item_check_iter;
53 
55  GBDATA *gb_main;
56  size_t species_count;
57  size_t sai_count;
58  set<string> consistencies;
59  item_check_map item_checks;
60 
61  GB_ERROR perform_selected_item_checks(ItemSelector& sel);
62 
63 public:
64 
65  CheckedConsistencies(GBDATA *gb_main_) : gb_main(gb_main_) {
66  GB_transaction ta(gb_main);
67  GBDATA *gb_checks = GB_search(gb_main, "checks", GB_CREATE_CONTAINER);
68 
69  for (GBDATA *gb_check = GB_entry(gb_checks, "check"); gb_check; gb_check = GB_nextEntry(gb_check)) {
70  consistencies.insert(GB_read_char_pntr(gb_check));
71  }
72 
73  species_count = GBT_get_species_count(gb_main);
74  sai_count = GBT_get_SAI_count(gb_main);
75  }
76 
77  bool was_performed(const string& check_name) const {
78  return consistencies.find(check_name) != consistencies.end();
79  }
80 
81  GB_ERROR register_as_performed(const string& check_name) {
83  if (was_performed(check_name)) {
84  printf("check '%s' already has been registered before. Duplicated check name?\n", check_name.c_str());
85  }
86  else {
87  GB_transaction ta(gb_main);
88 
89  GBDATA *gb_checks = GB_search(gb_main, "checks", GB_CREATE_CONTAINER);
90  GBDATA *gb_check = GB_create(gb_checks, "check", GB_STRING);
91 
92  if (!gb_check) error = GB_await_error();
93  else error = GB_write_string(gb_check, check_name.c_str());
94 
95  if (!error) consistencies.insert(check_name);
96  }
97  return error;
98  }
99 
100  void perform_check(const string& check_name,
101  GB_ERROR (*do_check)(GBDATA *gb_main, size_t species, size_t sais),
102  GB_ERROR& error)
103  {
104  if (!error && !was_performed(check_name)) {
105  arb_progress progress(check_name.c_str());
106  error = do_check(gb_main, species_count, sai_count);
107  if (!error) register_as_performed(check_name);
108  }
109  }
110 
111  void register_item_check(const string& check_name, item_check_fun item_check) {
112  if (!was_performed(check_name)) {
113  item_checks[check_name] = item_check;
114  }
115  }
116 
117  void perform_item_checks(GB_ERROR& error);
118 
120  GB_ERROR error = NULp;
121  GB_transaction ta(gb_main);
122 
123  GBDATA *gb_checks = GB_search(gb_main, "checks", GB_CREATE_CONTAINER);
124  for (GBDATA *gb_check = GB_entry(gb_checks, "check"); gb_check && !error; gb_check = GB_nextEntry(gb_check)) {
125  char *check_name = GB_read_string(gb_check);
126 
127 #if defined(DEBUG)
128  printf("Deleting check '%s'\n", check_name);
129 #endif // DEBUG
130  error = GB_delete(gb_check);
131  consistencies.erase(check_name);
132  free(check_name);
133  }
134  return error;
135  }
136 };
137 
138 GB_ERROR CheckedConsistencies::perform_selected_item_checks(ItemSelector& sel) {
139  GB_ERROR error = NULp;
140  item_check_iter end = item_checks.end();
141 
142  for (GBDATA *gb_cont = sel.get_first_item_container(gb_main, NULp, QUERY_ALL_ITEMS);
143  gb_cont && !error;
144  gb_cont = sel.get_next_item_container(gb_cont, QUERY_ALL_ITEMS))
145  {
146  for (GBDATA *gb_item = sel.get_first_item(gb_cont, QUERY_ALL_ITEMS);
147  gb_item && !error;
148  gb_item = sel.get_next_item(gb_item, QUERY_ALL_ITEMS))
149  {
150  for (item_check_iter chk = item_checks.begin(); chk != end && !error; ++chk) {
151  error = chk->second(gb_item, sel);
152  }
153  }
154  }
155 
156  return error;
157 }
158 
160  if (!item_checks.empty()) {
161  if (!error) {
162  GB_transaction ta(gb_main);
163  bool is_genome_db = GEN_is_genome_db(gb_main, -1);
164 
165  error = perform_selected_item_checks(SPECIES_get_selector());
166  if (!error && is_genome_db) {
167  error = perform_selected_item_checks(GEN_get_selector());
168  if (!error) error = perform_selected_item_checks(EXP_get_selector());
169  }
170 
171  error = ta.close(error);
172  }
173 
174  if (!error) {
175  item_check_iter end = item_checks.end();
176  for (item_check_iter chk = item_checks.begin(); chk != end && !error; ++chk) {
177  error = register_as_performed(chk->first);
178  }
179 
180  if (!error) item_checks.clear();
181  }
182  }
183 }
184 
185 // --------------------------------------------------------------------------------
186 
187 static GB_ERROR NT_fix_gene_data(GBDATA *gb_main, size_t species_count, size_t /* sai_count */) {
188  GB_transaction ta(gb_main);
189  arb_progress progress(species_count);
190 
191  size_t deleted_gene_datas = 0;
192  size_t generated_gene_datas = 0;
193  GB_ERROR error = NULp;
194 
195  for (GBDATA *gb_species = GBT_first_species(gb_main);
196  gb_species && !error;
197  gb_species = GBT_next_species(gb_species))
198  {
199  bool is_organism = (GB_entry(gb_species, GENOM_ALIGNMENT)); // same test as GEN_is_organism, but w/o genome-db-assertion
200  GBDATA *gb_gene_data = GEN_find_gene_data(gb_species);
201 
202  if (is_organism && !gb_gene_data) {
203  gb_gene_data = GEN_findOrCreate_gene_data(gb_species); // @@@ check result & handle error
204  generated_gene_datas++;
205  }
206  else if (!is_organism && gb_gene_data) {
207  GBDATA *gb_child = GB_child(gb_gene_data);
208  if (!gb_child) {
209  error = GB_delete(gb_gene_data);
210  if (!error) deleted_gene_datas++;
211  }
212  else {
213  error = GBS_global_string("Non-empty 'gene_data' found for species '%s',\n"
214  "which has no alignment '" GENOM_ALIGNMENT "',\n"
215  "i.e. which is not regarded as full-genome organism.\n"
216  "This causes problems - please fix!",
217  GBT_get_name_or_description(gb_species));
218  }
219  }
220 
221  progress.inc_and_check_user_abort(error);
222  }
223 
224  if (!error) {
225  if (deleted_gene_datas) {
226  aw_message(GBS_global_string("Deleted %zu useless empty 'gene_data' entries.", deleted_gene_datas));
227  }
228  if (generated_gene_datas) {
229  aw_message(GBS_global_string("Re-created %zu missing 'gene_data' entries.\nThese organisms have no genes yet!", generated_gene_datas));
230  }
231  }
232  return ta.close(error);
233 }
234 
235 // --------------------------------------------------------------------------------
236 
237 static GBDATA *expectField(GBDATA *gb_gene, const char *field, GB_ERROR& data_error) {
238  GBDATA *gb_field = NULp;
239  if (!data_error) {
240  gb_field = GB_entry(gb_gene, field);
241  if (!gb_field) data_error = GBS_global_string("Expected field '%s' missing", field);
242  }
243  return gb_field;
244 }
245 
246 static GBDATA *disexpectField(GBDATA *gb_gene, const char *field, GB_ERROR& data_error) {
247  GBDATA *gb_field = NULp;
248  if (!data_error) {
249  gb_field = GB_entry(gb_gene, field);
250  if (gb_field) data_error = GBS_global_string("Unexpected field '%s' exists (wrong value in pos_joined?)", field);
251  }
252  GBS_reuse_buffer(field);
253  return gb_field;
254 }
255 
256 static GB_ERROR NT_convert_gene_locations(GBDATA *gb_main, size_t species_count, size_t /* sai_count */) {
257  GB_transaction ta(gb_main);
258 
259  GB_ERROR error = NULp;
260  long fixed_genes = 0;
261  long skipped_genes = 0;
262  long genes = 0;
263 
264  typedef vector<GBDATA*> GBvec;
265  GBvec toDelete;
266 
267  arb_progress progress(species_count);
268 
269  for (GBDATA *gb_organism = GEN_first_organism(gb_main);
270  gb_organism && !error;
271  gb_organism = GEN_next_organism(gb_organism))
272  {
273  GBDATA *gb_gene_data = GEN_find_gene_data(gb_organism);
274  nt_assert(gb_gene_data);
275  if (gb_gene_data) {
276  for (GBDATA *gb_gene = GEN_first_gene_rel_gene_data(gb_gene_data);
277  gb_gene && !error;
278  gb_gene = GEN_next_gene(gb_gene))
279  {
280  genes++;
281 
282  int parts = 1;
283  {
284  GBDATA *gb_pos_joined = GB_entry(gb_gene, "pos_joined");
285  if (gb_pos_joined) parts = GB_read_int(gb_pos_joined); // its a joined gene
286  }
287 
288  GBDATA *gb_pos_start = GB_entry(gb_gene, "pos_start"); // test for new format
289  if (!gb_pos_start) {
290  GBDATA *gb_pos_begin = GB_entry(gb_gene, "pos_begin"); // test for old format
291  if (!gb_pos_begin) {
292  error = "Neither 'pos_begin' nor 'pos_start' found - format of gene location is unknown";
293  }
294  }
295 
296  if (!gb_pos_start && !error) { // assume old format
297  // parts<-1 would be valid in new format, but here we have old format
298  if (parts<1) error = GBS_global_string("Illegal value in 'pos_joined' (%i)", parts);
299 
300  GB_ERROR data_error = NULp; // error in this gene -> don't convert
301  GEN_position *pos = GEN_new_position(parts, false); // all were joinable (no information about it was stored)
302 
303  // parse old gene information into 'pos'
304  //
305  // old-format was:
306  // Start-Positions: pos_begin, pos_begin2, pos_begin3, ...
307  // End-Positions: pos_end, pos_end2, pos_end3, ...
308  // Joined?: pos_joined (always >= 1)
309  // Complement: complement (one entry for all parts)
310  // Certainty: pos_uncertain (maybe pos_uncertain1 etc.)
311 
312  int complement = 0;
313  {
314  GBDATA *gb_complement = GB_entry(gb_gene, "complement");
315  if (gb_complement) {
316  complement = GB_read_byte(gb_complement);
317  toDelete.push_back(gb_complement);
318  }
319  }
320 
321  bool has_uncertain_fields = false;
322  for (int p = 1; p <= parts && !error && !data_error; ++p) {
323  GBDATA *gb_pos_begin = NULp;
324  GBDATA *gb_pos_end = NULp;
325  const char *pos_uncertain_field = NULp;
326 
327  if (p == 1) {
328  gb_pos_begin = expectField(gb_gene, "pos_begin", data_error);
329  gb_pos_end = expectField(gb_gene, "pos_end", data_error);
330 
331  pos_uncertain_field = "pos_uncertain";
332  }
333  else {
334  const char *pos_begin_field = GBS_global_string("pos_begin%i", p);
335  const char *pos_end_field = GBS_global_string("pos_end%i", p);
336 
337  gb_pos_begin = expectField(gb_gene, pos_begin_field, data_error);
338  gb_pos_end = expectField(gb_gene, pos_end_field, data_error);
339 
340  GBS_reuse_buffer(pos_end_field);
341  GBS_reuse_buffer(pos_begin_field);
342 
343  if (!data_error) pos_uncertain_field = GBS_global_string("pos_uncertain%i", p);
344  }
345 
346  int pospos = complement ? (parts-p) : (p-1);
347 
348  if (!data_error) {
349  GBDATA *gb_pos_uncertain = GB_entry(gb_gene, pos_uncertain_field);
350 
351  if (!gb_pos_uncertain) {
352  if (has_uncertain_fields) data_error = GBS_global_string("Expected field '%s' missing", pos_uncertain_field);
353  }
354  else {
355  if (p == 1) has_uncertain_fields = true;
356  else {
357  if (!has_uncertain_fields) {
358  data_error = GBS_global_string("Found '%s' as first certainty-information", pos_uncertain_field);
359  }
360  }
361  }
362 
363  if (!data_error) {
364  int begin = GB_read_int(gb_pos_begin);
365  int end = GB_read_int(gb_pos_end);
366 
367  pos->start_pos[pospos] = begin;
368  pos->stop_pos[pospos] = end;
369  pos->complement[pospos] = complement; // set all complement entries to same value (old format only had one complement entry)
370 
371  if (gb_pos_uncertain) {
372  const char *uncertain = GB_read_char_pntr(gb_pos_uncertain);
373 
374  if (!uncertain) error = GB_await_error();
375  else {
376  if (!pos->start_uncertain) GEN_use_uncertainties(pos);
377 
378  if (strlen(uncertain) != 2) {
379  data_error = "wrong length";
380  }
381  else {
382  for (int up = 0; up<2; up++) {
383  if (!strchr("<=>", uncertain[up])) {
384  data_error = GBS_global_string("illegal character '%c'", uncertain[up]);
385  }
386  else {
387  (up == 0 ? pos->start_uncertain[pospos] : pos->stop_uncertain[pospos]) = uncertain[up];
388  }
389  }
390  }
391 
392 
393  toDelete.push_back(gb_pos_uncertain);
394  }
395  }
396 
397  toDelete.push_back(gb_pos_begin);
398  toDelete.push_back(gb_pos_end);
399  }
400  }
401  }
402 
403  for (int p = parts+1; p <= parts+4 && !error && !data_error; ++p) {
404  disexpectField(gb_gene, GBS_global_string("pos_begin%i", p), data_error);
405  disexpectField(gb_gene, GBS_global_string("pos_end%i", p), data_error);
406  disexpectField(gb_gene, GBS_global_string("complement%i", p), data_error);
407  disexpectField(gb_gene, GBS_global_string("pos_uncertain%i", p), data_error);
408  }
409 
410  // now save new position data
411 
412  if (data_error) {
413  skipped_genes++;
414  }
415  else if (!error) {
416  error = GEN_write_position(gb_gene, pos, 0);
417 
418  if (!error) {
419  // delete old-format entries
420  GBvec::const_iterator end = toDelete.end();
421  for (GBvec::const_iterator i = toDelete.begin(); i != end && !error; ++i) {
422  GBDATA *gb_del = *i;
423  error = GB_delete(gb_del);
424  }
425 
426  if (!error) fixed_genes++;
427  }
428  }
429 
430  toDelete.clear();
431  GEN_free_position(pos);
432 
433  if (data_error || error) {
434  char *gene_id = GEN_global_gene_identifier(gb_gene, gb_organism);
435  if (error) {
436  error = GBS_global_string("Gene '%s': %s", gene_id, error);
437  }
438  else {
439  aw_message(GBS_global_string("Gene '%s' was not converted, fix data manually!\nReason: %s", gene_id, data_error));
440  }
441  free(gene_id);
442  }
443  }
444  }
445  }
446 
447  progress.inc_and_check_user_abort(error);
448  }
449 
450  if (!error) {
451  if (fixed_genes>0) aw_message(GBS_global_string("Fixed location entries of %li genes.", fixed_genes));
452  if (skipped_genes>0) {
453  aw_message(GBS_global_string("Didn't fix location entries of %li genes (see warnings).", skipped_genes));
454  error = "Not all gene locations were fixed.\nFix manually, save DB and restart ARB with that DB.\nMake sure you have a backup copy of the original DB!";
455  }
456 
457  if (fixed_genes || skipped_genes) {
458  long already_fixed_genes = genes-(fixed_genes+skipped_genes);
459  if (already_fixed_genes>0) aw_message(GBS_global_string("Location entries of %li genes already were in new format.", already_fixed_genes));
460  }
461  }
462 
463  return error;
464 }
465 
466 
467 // --------------------------------------------------------------------------------
468 
469 static GB_ERROR NT_del_mark_move_REF(GBDATA *gb_main, size_t species_count, size_t sai_count) {
470  GB_transaction ta(gb_main);
471  GB_ERROR error = NULp;
472  size_t all = species_count+sai_count;
473  size_t removed = 0;
474 
475  // delete 'mark' entries from all alignments of species/SAIs
476 
477  arb_progress progress(all);
478  ConstStrArray ali_names;
479  GBT_get_alignment_names(ali_names, gb_main);
480 
481  for (int pass = 0; pass < 2 && !error; ++pass) {
482  for (GBDATA *gb_item = (pass == 0) ? GBT_first_species(gb_main) : GBT_first_SAI(gb_main);
483  gb_item && !error;
484  gb_item = (pass == 0) ? GBT_next_species(gb_item) : GBT_next_SAI(gb_item))
485  {
486  for (int ali = 0; ali_names[ali] && !error; ++ali) {
487  GBDATA *gb_ali = GB_entry(gb_item, ali_names[ali]);
488  if (gb_ali) {
489  GBDATA *gb_mark = GB_entry(gb_ali, "mark");
490  if (gb_mark) {
491  error = GB_delete(gb_mark);
492  removed++;
493  }
494  }
495  }
496 
497  progress.inc_and_check_user_abort(error);
498  }
499  }
500 
501  {
502  char *helix_name = GBT_get_default_helix(gb_main);
503  GBDATA *gb_helix = GBT_find_SAI(gb_main, helix_name);
504 
505  if (gb_helix) {
506  for (int ali = 0; ali_names[ali] && !error; ++ali) {
507  GBDATA *gb_ali = GB_entry(gb_helix, ali_names[ali]);
508  GBDATA *gb_old_ref = GB_entry(gb_ali, "REF");
509  GBDATA *gb_new_ref = GB_entry(gb_ali, "_REF");
510 
511  if (gb_old_ref) {
512  if (gb_new_ref) {
513  error = GBS_global_string("SAI:%s has 'REF' and '_REF' in '%s' (data corrupt?!)",
514  helix_name, ali_names[ali]);
515  }
516  else { // move info from REF -> _REF
517  char *content = GB_read_string(gb_old_ref);
518  if (!content) error = GB_await_error();
519  else {
520  gb_new_ref = GB_create(gb_ali, "_REF", GB_STRING);
521  if (!gb_new_ref) error = GB_await_error();
522  else {
523  error = GB_write_string(gb_new_ref, content);
524  if (!error) error = GB_delete(gb_old_ref);
525  }
526  free(content);
527  }
528  }
529  }
530  }
531  }
532 
533  free(helix_name);
534  }
535 
536  if (!error) {
537  if (removed) {
538  aw_message(GBS_global_string("Deleted %zu useless 'mark' entries.", removed));
539  }
540  }
541 
542  return ta.close(error);
543 }
544 
545 // --------------------------------------------------------------------------------
546 
547 static bool testDictionaryCompression(GBDATA *gbd, GBQUARK key_quark, bool testUse) {
548  // returns true, if
549  // testUse == true and ANY entries below 'gbd' with quark 'key_quark' uses dictionary compression
550  // testUse == false and ALL entries below 'gbd' with quark 'key_quark' can be decompressed w/o errors
551 
552  nt_assert(GB_read_type(gbd) == GB_DB);
553 
554  for (GBDATA *gb_sub = GB_child(gbd); gb_sub; gb_sub = GB_nextChild(gb_sub)) {
555  switch (GB_read_type(gb_sub)) {
556  case GB_DB:
557  // return false if any compression failed or return true if any uses dict-compression
558  if (testDictionaryCompression(gb_sub, key_quark, testUse) == testUse) return testUse;
559  break;
560 
561  case GB_STRING:
562  if (GB_get_quark(gb_sub) == key_quark && GB_is_dictionary_compressed(gb_sub)) {
563  if (testUse) return true;
564 
565  const char *decompressed = GB_read_char_pntr(gb_sub);
566  if (!decompressed) return false;
567  }
568  break;
569 
570  default:
571  break;
572  }
573  }
574 
575  return !testUse;
576 }
577 
// Dict is defined further below; KeyInfo already needs a pointer type for it.
578 class Dict;
579 typedef SmartPtr<Dict> DictPtr;
580 
581 
582 class KeyInfo : virtual Noncopyable {
583  string name; // keyname
584  DictPtr original;
585 
586  bool compressionTested;
587  bool compressed;
588 
589  void init() {
590  compressionTested = false;
591  compressed = false;
592  }
593 
594 public:
595  KeyInfo(const char *Name) : name(Name) { init(); }
596  KeyInfo(const char *Name, DictPtr originalDict) : name(Name), original(originalDict) { init(); }
597 
598  void testCompressed(GBDATA *gb_main) {
599  nt_assert(!compressionTested);
600  compressed = testDictionaryCompression(gb_main, GB_find_or_create_quark(gb_main, name.c_str()), true);
601  compressionTested = true;
602  }
603 
604  const string& getName() const { return name; }
605 
606  bool isCompressed() const {
607  nt_assert(compressionTested);
608  return compressed;
609  }
610 };
611 
612 
613 class Dict : virtual Noncopyable {
614  string group; // lowercase keyname
615  string orgkey;
616  DictData *data;
617 
618  map<string, bool> decompressWorks; // key -> bool
619 
620 public:
621  static GBDATA *gb_main;
622 
623  Dict(const char *Group, const char *OrgKey, DictData *Data) : group(Group), orgkey(OrgKey), data(Data) {}
624 
625  const string& getGroup() const { return group; }
626  const string& getOriginalKey() const { return orgkey; }
627 
628  bool mayBeUsedWith(const string& key) const { return strcasecmp(group.c_str(), key.c_str()) == 0; }
629 
630  GB_ERROR assignToKey(const string& key) const { return GB_set_dictionary(gb_main, key.c_str(), data); }
631  GB_ERROR unassignFromKey(const string& key) const { return GB_set_dictionary(gb_main, key.c_str(), NULp); }
632 
633  bool canDecompress(const string& key) {
634  nt_assert(mayBeUsedWith(key));
635  if (decompressWorks.find(key) == decompressWorks.end()) {
636  bool works = false;
637  GB_ERROR error = assignToKey(key);
638 
639  if (!error) works = testDictionaryCompression(gb_main, GB_find_or_create_quark(gb_main, key.c_str()), false);
640  decompressWorks[key] = works;
641 
642  GB_ERROR err2 = unassignFromKey(key);
643  if (err2) {
644  aw_message(GBS_global_string("Error while removing @dictionary from key '%s': %s", key.c_str(), err2));
645  }
646  }
647  return decompressWorks[key];
648  }
649 };
651 
652 
653 typedef map<string, int> KeyCounter; // groupname -> occur count
655 typedef map<string, KeyInfoPtr> Keys; // keyname -> info
656 typedef map<string, DictPtr> DictMap;
657 typedef vector<DictPtr> Dicts;
658 typedef set<string> StringSet;
659 
660 #define STATUS_PREFIX "Dictionary: "
661 
//! @return true if the associative container 'container' has an element with key 'key'
template<typename CONT, typename KEY>
bool contains(const CONT& container, const KEY& key) {
    return container.count(key) != 0;
}
666 
667 static GB_ERROR findAffectedKeys(GBDATA *gb_key_data, KeyCounter& kcount, Keys& keys, Dicts& dicts) {
668  GB_ERROR error = NULp;
669  GBDATA *gb_main = GB_get_root(gb_key_data);
670 
671  for (int pass = 1; pass <= 2; ++pass) {
672  for (GBDATA *gb_key = GB_entry(gb_key_data, "@key"); !error && gb_key; gb_key = GB_nextEntry(gb_key)) {
673  GBDATA *gb_name = GB_entry(gb_key, "@name");
674  const char *keyName = GB_read_char_pntr(gb_name);
675 
676  if (!keyName) {
677  error = GBS_global_string("@key w/o @name (%s)", GB_await_error());
678  }
679  else {
680  char *keyGroup = ARB_strdup(keyName);
681  ARB_strlower(keyGroup);
682 
683  switch (pass) {
684  case 1:
685  kcount[keyGroup]++;
686  break;
687  case 2:
688  if (kcount[keyGroup]>1) {
689  GBDATA *gb_dictionary = GB_entry(gb_key, "@dictionary");
690  if (gb_dictionary) {
691  DictPtr dict = new Dict(keyGroup, keyName, GB_get_dictionary(gb_main, keyName));
692  keys[keyName] = new KeyInfo(keyName, dict);
693  dicts.push_back(dict);
694  }
695  else keys[keyName] = new KeyInfo(keyName);
696  }
697  else kcount.erase(keyGroup);
698  break;
699  }
700  free(keyGroup);
701  }
702  }
703  }
704  return error;
705 }
706 
707 static GB_ERROR deleteDataOfKey(GBDATA *gbd, GBQUARK key_quark, StringSet& deletedData, long& deleted, long& notDeleted) {
708  GB_ERROR error = NULp;
709  for (GBDATA *gb_sub = GB_child(gbd); gb_sub; gb_sub = GB_nextChild(gb_sub)) {
710  switch (GB_read_type(gb_sub)) {
711  case GB_DB:
712  error = deleteDataOfKey(gb_sub, key_quark, deletedData, deleted, notDeleted);
713  break;
714 
715  case GB_STRING:
716  if (GB_get_quark(gb_sub) == key_quark) {
717  if (GB_is_dictionary_compressed(gb_sub)) {
718  string path(GB_get_db_path(gb_sub));
719  error = GB_delete(gb_sub);
720  if (!error) {
721  deletedData.insert(path);
722  deleted++;
723  }
724  }
725  else {
726  notDeleted++;
727  }
728  }
729  break;
730  default:
731  break;
732  }
733  }
734  return error;
735 }
736 
737 static char *readFirstCompressedDataOf(GBDATA *gbd, GBQUARK key_quark) {
738  char *data = NULp;
739  for (GBDATA *gb_sub = GB_child(gbd); !data && gb_sub; gb_sub = GB_nextChild(gb_sub)) {
740  switch (GB_read_type(gb_sub)) {
741  case GB_DB:
742  data = readFirstCompressedDataOf(gb_sub, key_quark);
743  break;
744 
745  case GB_STRING:
746  if (GB_get_quark(gb_sub) == key_quark) {
747  if (GB_is_dictionary_compressed(gb_sub)) {
748  data = GB_read_as_string(gb_sub);
749  }
750  }
751  break;
752  default:
753  break;
754  }
755  }
756  return data;
757 }
758 
759 
760 static GB_ERROR NT_fix_dict_compress(GBDATA *gb_main, size_t, size_t) {
761  GB_transaction ta(gb_main);
762  GBDATA *gb_key_data = GB_search(gb_main, GB_SYSTEM_FOLDER "/" GB_SYSTEM_KEY_DATA, GB_FIND);
763  GB_ERROR error = NULp;
764 
765  Dict::gb_main = gb_main;
766 
767  if (!gb_key_data) {
768  error = "No " GB_SYSTEM_KEY_DATA " found.. DB corrupted?";
769  }
770  else {
771  KeyCounter kcount; // strlwr(keyname) -> count
772  Keys keys;
773  Dicts dicts;
774 
775  error = findAffectedKeys(gb_key_data, kcount, keys, dicts);
776 
777  // count affected keys
778  long affectedKeys = 0;
779  for (KeyCounter::iterator kci = kcount.begin(); kci != kcount.end(); ++kci) {
780  affectedKeys += kci->second;
781  }
782 
783  if (!error && affectedKeys>0) {
784  // check which keys are compressed
785 
786  {
787  arb_progress progress(STATUS_PREFIX "search compressed data", affectedKeys);
788 
789  for (Keys::iterator ki = keys.begin(); ki != keys.end(); ++ki) {
790  KeyInfoPtr k = ki->second;
791  k->testCompressed(gb_main);
792  ++progress;
793  }
794  }
795 
796  // test which key/dict combinations work
797  long combinations = 0; // possible key/dict combinations
798 
799  DictMap use; // keyname -> dictionary (which dictionary to use)
800  StringSet multiDecompressible; // keys which can be decompressed with multiple dictionaries
801 
802  for (int pass = 1; pass <= 2; ++pass) {
803  arb_progress *progress = NULp;
804  if (pass == 2 && combinations) progress = new arb_progress(STATUS_PREFIX "test compression", combinations);
805 
806  for (Dicts::iterator di = dicts.begin(); di != dicts.end(); ++di) {
807  DictPtr d = *di;
808 
809  for (Keys::iterator ki = keys.begin(); ki != keys.end(); ++ki) {
810  KeyInfoPtr k = ki->second;
811  const string& keyname = k->getName();
812 
813  if (k->isCompressed() && d->mayBeUsedWith(keyname)) {
814  switch (pass) {
815  case 1:
816  combinations++;
817  break;
818  case 2:
819  if (d->canDecompress(keyname)) {
820  if (!contains(use, keyname)) { // first dictionary working with keyname
821  use[keyname] = d;
822  }
823  else { // already have another dictionary working with keyname
824  multiDecompressible.insert(keyname);
825  }
826  }
827  ++(*progress);
828  break;
829  }
830  }
831  }
832  }
833  delete progress;
834  }
835 
836  StringSet notDecompressible; // keys which can be decompressed with none of the dictionaries
837  for (Keys::iterator ki = keys.begin(); ki != keys.end(); ++ki) {
838  KeyInfoPtr k = ki->second;
839  const string& keyname = k->getName();
840 
841  if (k->isCompressed()) {
842  if (!contains(use, keyname)) notDecompressible.insert(keyname);
843  if (contains(multiDecompressible, keyname)) use.erase(keyname);
844  }
845  }
846 
847  bool dataLost = false;
848  int reassigned = 0;
849 
850  if (!notDecompressible.empty()) {
851  // bad .. found undecompressible data
852  long nd_count = notDecompressible.size();
853  aw_message(GBS_global_string("Detected corrupted dictionary compression\n"
854  "Data of %li DB-keys is lost and will be deleted", nd_count));
855 
856  arb_progress progress(STATUS_PREFIX "deleting corrupt data", nd_count);
857 
858  StringSet deletedData;
859  long deleted = 0;
860  long notDeleted = 0;
861 
862  for (StringSet::iterator ki = notDecompressible.begin(); !error && ki != notDecompressible.end(); ++ki) {
863  const string& keyname = *ki;
864 
865  error = deleteDataOfKey(gb_main, GB_find_or_create_quark(gb_main, keyname.c_str()), deletedData, deleted, notDeleted);
866  ++progress;
867  }
868 
869  if (!error) {
870  nt_assert(deleted); // at least 1 db-entry should have been deleted
871 
872  aw_message(GBS_global_string("Deleted %li of %li affected DB-entries", deleted, deleted+notDeleted));
873  aw_message("see console for a list of affected keys");
874 
875  printf("Deleted keys:\n");
876  for (StringSet::iterator di = deletedData.begin(); di != deletedData.end(); ++di) {
877  printf("* %s\n", di->c_str());
878  }
879  }
880  }
881 
882  if (!error && !multiDecompressible.empty()) {
883  for (StringSet::iterator ki = multiDecompressible.begin(); !error && ki != multiDecompressible.end(); ++ki) {
884  const string& keyname = *ki;
885  int possible = 0;
886  vector<DictPtr> possibleDicts;
887 
888  printf("--------------------------------------------------------------------------------\n");
889 
890  for (Dicts::iterator di = dicts.begin(); !error && di != dicts.end(); ++di) {
891  DictPtr d = *di;
892  if (d->mayBeUsedWith(keyname) && d->canDecompress(keyname)) {
893  error = d->assignToKey(keyname);
894  if (!error) {
895  char *data = readFirstCompressedDataOf(gb_main, GB_find_or_create_quark(gb_main, keyname.c_str()));
896 
897  nt_assert(data);
898  possible++;
899  printf("possibility %i = '%s'\n", possible, data);
900  free(data);
901 
902  possibleDicts.push_back(d);
903 
904  error = d->unassignFromKey(keyname);
905  }
906  }
907  }
908 
909  if (!error) {
910  nt_assert(possible>0);
911 
912  int selected;
913  if (possible>1) {
914  char *question = GBS_global_string_copy("%i possibilities to decompress field '%s' have been detected\n"
915  "and example data was dumped to the console.\n"
916  "Please examine output and decide which is the correct possibility!",
917  possible, keyname.c_str());
918 
919  const char *buttons = "Abort";
920  for (int p = 1; p <= possible; ++p) buttons = GBS_global_string("%s,%i", buttons, p);
921  selected = aw_question("dict_decompress_bug", question, buttons, false, NULp);
922  free(question);
923  }
924  else {
925  selected = 1;
926  }
927 
928  if (!selected) {
929  error = "Aborted by user";
930  }
931  else {
932  use[keyname] = possibleDicts[selected-1];
933  }
934  }
935  }
936  }
937 
938  // now all redundancies should be eliminated and we can assign dictionaries to affected keys
939  if (!error) {
940  for (Keys::iterator ki = keys.begin(); !error && ki != keys.end(); ++ki) {
941  KeyInfoPtr k = ki->second;
942  const string& keyname = k->getName();
943 
944  if (k->isCompressed()) {
945  if (!contains(use, keyname)) {
946  error = GBS_global_string("No dictionary detected for key '%s'", keyname.c_str());
947  }
948  else {
949  DictPtr d = use[keyname];
950 
951  if (d->getOriginalKey() != keyname) {
952  d->assignToKey(keyname); // set the dictionary
953  aw_message(GBS_global_string("Assigning '%s'-dictionary to '%s'",
954  d->getOriginalKey().c_str(), keyname.c_str()));
955  reassigned++;
956  }
957  }
958  }
959  }
960  }
961 
962  if (dataLost||reassigned) {
963  aw_message(dataLost
964  ? "We apologize for the data-loss."
965  : "No conflicts detected in compressed data.");
966  aw_message("Dictionaries fixed.\n"
967  "Please save your database with a new name.");
968  }
969  }
970  }
971 
972  Dict::gb_main = NULp;
973  return ta.close(error);
974 }
975 
976 // --------------------------------------------------------------------------------
977 
979  // Databases out there may contain multiple 'ARB_color' entries.
980  // Due to some already fixed bug - maybe introduced in r5309 and fixed in r5825
981 
982  GBDATA *gb_color = GB_entry(gb_item, GB_COLORGROUP_ENTRY);
983  GB_ERROR error = NULp;
984 
985 #if defined(DEBUG)
986  int del_count = 0;
987 #endif // DEBUG
988 
989  if (gb_color) {
990  GB_topSecurityLevel unsecured(gb_color);
991  while (!error) {
992  GBDATA *gb_next_color = GB_nextEntry(gb_color);
993  if (!gb_next_color) break;
994 
995  error = GB_delete(gb_next_color);
996 #if defined(DEBUG)
997  if (!error) del_count++;
998 #endif // DEBUG
999  }
1000  }
1001 
1002 #if defined(DEBUG)
1003  if (del_count) fprintf(stderr,
1004  "- deleted %i duplicated '" GB_COLORGROUP_ENTRY "' from %s '%s'\n",
1005  del_count,
1006  sel.item_name,
1007  sel.generate_item_id(GB_get_root(gb_item), gb_item));
1008 #endif // DEBUG
1009 
1010  return error;
1011 }
1012 
1013 // --------------------------------------------------------------------------------
1014 
1016  // status is already open and will be closed by caller!
1017 
1018  CheckedConsistencies check(gb_main);
1019  GB_ERROR err = NULp;
1020  bool is_genome_db;
1021  {
1022  GB_transaction ta(gb_main);
1023  is_genome_db = GEN_is_genome_db(gb_main, -1);
1024  }
1025 
1026  check.perform_check("fix gene_data", NT_fix_gene_data, err);
1027  check.perform_check("fix_dict_compress", NT_fix_dict_compress, err); // do this before NT_del_mark_move_REF (cause 'REF' is affected)
1028  check.perform_check("del_mark_move_REF", NT_del_mark_move_REF, err);
1029 
1030  if (is_genome_db) {
1031  check.perform_check("convert_gene_locations", NT_convert_gene_locations, err);
1032  }
1033 
1034  check.register_item_check("duplicated_item_colors", remove_dup_colors);
1035  check.perform_item_checks(err);
1036 
1037  return err;
1038 }
1039 
1040 void NT_rerepair_DB(AW_window*, GBDATA *gb_main) {
1041  // re-perform all DB checks
1042  GB_ERROR err = NULp;
1043  {
1044  CheckedConsistencies check(gb_main);
1045  err = check.forgetDoneChecks();
1046  }
1047  if (!err) {
1048  arb_progress progress("DB-Repair");
1049  err = NT_repair_DB(gb_main);
1050  }
1051 
1052  if (err) aw_message(err);
1053 }
1054 
1055 
ItemSelector & GEN_get_selector()
const char * GB_ERROR
Definition: arb_core.h:25
#define GB_SYSTEM_FOLDER
Definition: arbdb.h:27
GBDATA * GBT_first_SAI(GBDATA *gb_main)
Definition: aditem.cxx:162
const char * item_name
Definition: items.h:66
void testCompressed(GBDATA *gb_main)
unsigned char * complement
Definition: adGene.h:41
group_matcher all()
Definition: test_unit.h:1011
GBDATA * GEN_next_gene(GBDATA *gb_gene)
Definition: adGene.cxx:138
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:729
bool canDecompress(const string &key)
GBDATA * GB_child(GBDATA *father)
Definition: adquery.cxx:322
KeyInfo(const char *Name, DictPtr originalDict)
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1387
void GEN_free_position(GEN_position *pos)
Definition: adGene.cxx:195
static GBDATA * gb_main
static GB_ERROR NT_convert_gene_locations(GBDATA *gb_main, size_t species_count, size_t)
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
GB_ERROR assignToKey(const string &key) const
GBDATA * GEN_findOrCreate_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:44
long GBT_get_SAI_count(GBDATA *gb_main)
Definition: aditem.cxx:211
void GBT_get_alignment_names(ConstStrArray &names, GBDATA *gbd)
Definition: adali.cxx:317
static GB_ERROR deleteDataOfKey(GBDATA *gbd, GBQUARK key_quark, StringSet &deletedData, long &deleted, long &notDeleted)
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1060
static GBDATA * disexpectField(GBDATA *gb_gene, const char *field, GB_ERROR &data_error)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
STL namespace.
GB_ERROR unassignFromKey(const string &key) const
void GEN_use_uncertainties(GEN_position *pos)
Definition: adGene.cxx:184
static GBDATA * expectField(GBDATA *gb_gene, const char *field, GB_ERROR &data_error)
const string & getOriginalKey() const
set< string > StringSet
Dict(const char *Group, const char *OrgKey, DictData *Data)
void register_item_check(const string &check_name, item_check_fun item_check)
static GB_ERROR NT_fix_gene_data(GBDATA *gb_main, size_t species_count, size_t)
map< string, item_check_fun > item_check_map
Definition: NT_dbrepair.cxx:51
ItemSelector & EXP_get_selector()
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
static GB_ERROR remove_dup_colors(GBDATA *gb_item, ItemSelector &IF_DEBUG(sel))
GBDATA * GBT_find_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:177
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
Definition: arbdb.h:78
GB_TYPES GB_read_type(GBDATA *gbd)
Definition: arbdb.cxx:1643
SmartPtr< KeyInfo > KeyInfoPtr
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1781
bool isCompressed() const
KeyInfo(const char *Name)
GB_ERROR register_as_performed(const string &check_name)
Definition: NT_dbrepair.cxx:81
unsigned char * start_uncertain
Definition: adGene.h:53
static int group[MAXN+1]
Definition: ClustalV.cxx:65
void perform_check(const string &check_name, GB_ERROR(*do_check)(GBDATA *gb_main, size_t species, size_t sais), GB_ERROR &error)
size_t * stop_pos
Definition: adGene.h:40
map< string, KeyInfoPtr > Keys
GBDATA *(* get_first_item_container)(GBDATA *, AW_root *, QUERY_RANGE)
Definition: items.h:70
void NT_rerepair_DB(AW_window *, GBDATA *gb_main)
#define GB_COLORGROUP_ENTRY
Definition: ad_colorset.h:21
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * GB_get_root(GBDATA *gbd)
Definition: arbdb.cxx:1740
GBDATA *(* get_next_item_container)(GBDATA *, QUERY_RANGE)
Definition: items.h:71
#define STATUS_PREFIX
char * ARB_strlower(char *s)
Definition: arb_str.h:64
GBQUARK GB_find_or_create_quark(GBDATA *gbd, const char *key)
Definition: arbdb.cxx:1695
static GB_ERROR NT_fix_dict_compress(GBDATA *gb_main, size_t, size_t)
const string & getName() const
GBDATA * GEN_find_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:50
GB_ERROR GB_set_dictionary(GBDATA *gb_main, const char *key, const DictData *dd)
Definition: adsystem.cxx:295
GBDATA * GEN_next_organism(GBDATA *gb_organism)
Definition: adGene.cxx:745
const string & getGroup() const
GB_ERROR GEN_write_position(GBDATA *gb_gene, const GEN_position *pos, long seqLength)
Definition: adGene.cxx:325
GBQUARK GB_get_quark(GBDATA *gbd)
Definition: arbdb.cxx:1703
char *(* generate_item_id)(GBDATA *gb_main, GBDATA *gb_item)
Definition: items.h:59
Definition: arbdb.h:86
void GBS_reuse_buffer(const char *global_buffer)
Definition: arb_msg.cxx:563
GBDATA *(* get_first_item)(GBDATA *, QUERY_RANGE)
Definition: items.h:73
#define GENOM_ALIGNMENT
Definition: adGene.h:19
CheckedConsistencies(GBDATA *gb_main_)
Definition: NT_dbrepair.cxx:65
static char * readFirstCompressedDataOf(GBDATA *gbd, GBQUARK key_quark)
GB_ERROR(* item_check_fun)(GBDATA *gb_item, ItemSelector &sel)
Definition: NT_dbrepair.cxx:49
static GB_ERROR findAffectedKeys(GBDATA *gb_key_data, KeyCounter &kcount, Keys &keys, Dicts &dicts)
vector< DictPtr > Dicts
#define nt_assert(cond)
Definition: NT_local.h:27
map< string, DictPtr > DictMap
map< string, int > KeyCounter
bool was_performed(const string &check_name) const
Definition: NT_dbrepair.cxx:77
GEN_position * GEN_new_position(int parts, bool joinable)
Definition: adGene.cxx:155
GBDATA *(* get_next_item)(GBDATA *, QUERY_RANGE)
Definition: items.h:74
int aw_question(const char *unique_id, const char *question, const char *buttons, bool sameSizeButtons, const char *helpfile)
Definition: AW_question.cxx:26
GBDATA * GBT_next_SAI(GBDATA *gb_sai)
Definition: aditem.cxx:166
char * GEN_global_gene_identifier(GBDATA *gb_gene, GBDATA *gb_organism)
Definition: adGene.cxx:783
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:35
bool mayBeUsedWith(const string &key) const
int GB_read_byte(GBDATA *gbd)
Definition: arbdb.cxx:734
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:909
GB_ERROR forgetDoneChecks()
ItemSelector & SPECIES_get_selector()
Definition: species.cxx:139
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
static ARB_init_perl_interface init
Definition: ARB_ext.c:101
const char * GB_get_db_path(GBDATA *gbd)
Definition: adTest.cxx:14
void aw_message(const char *msg)
Definition: AW_status.cxx:1142
size_t * start_pos
Definition: adGene.h:39
GB_ERROR NT_repair_DB(GBDATA *gb_main)
char * GBT_get_default_helix(GBDATA *)
Definition: adtools.cxx:64
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:116
#define GB_SYSTEM_KEY_DATA
Definition: arbdb.h:28
static GB_ERROR NT_del_mark_move_REF(GBDATA *gb_main, size_t species_count, size_t sai_count)
bool contains(const CONT &container, const KEY &key)
SmartPtr< Dict > DictPtr
GBDATA * GB_nextChild(GBDATA *child)
Definition: adquery.cxx:326
static bool testDictionaryCompression(GBDATA *gbd, GBQUARK key_quark, bool testUse)
bool GB_is_dictionary_compressed(GBDATA *gbd)
Definition: adcompr.cxx:869
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:32
void perform_item_checks(GB_ERROR &error)
item_check_map::const_iterator item_check_iter
Definition: NT_dbrepair.cxx:52
#define IF_DEBUG(x)
Definition: arb_assert.h:303
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
GBDATA * GEN_first_organism(GBDATA *gb_main)
Definition: adGene.cxx:739
DictData * GB_get_dictionary(GBDATA *gb_main, const char *key)
Definition: adsystem.cxx:279
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:459
bool GEN_is_genome_db(GBDATA *gb_main, int default_value)
Definition: adGene.cxx:20
GBDATA * GEN_first_gene_rel_gene_data(GBDATA *gb_gene_data)
Definition: adGene.cxx:134
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:332
Definition: Group.hxx:20
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
unsigned char * stop_uncertain
Definition: adGene.h:54