ARB
adali.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : adali.cxx //
4 // Purpose : alignments //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include <arbdbt.h>
12 #include <adGene.h>
13 
14 #include "gb_local.h"
15 
16 #include <arb_strarray.h>
17 #include <arb_str.h>
18 
19 static void check_for_species_without_data(const char *species_name, long value, void *counterPtr) {
20  if (value == 1) {
21  long cnt = *((long*)counterPtr);
22  if (cnt<40) {
23  GB_warningf("Species '%s' has no data in any alignment", species_name);
24  }
25  *((long*)counterPtr) = cnt+1;
26  }
27 }
28 
30  return GBT_find_or_create(gb_main, "presets", 7);
31 }
32 
34  int count = 0;
35  GBDATA *gb_presets = GBT_get_presets(gb_main);
36  for (GBDATA *gb_ali = GB_entry(gb_presets, "alignment");
37  gb_ali;
38  gb_ali = GB_nextEntry(gb_ali))
39  {
40  ++count;
41  }
42  return count;
43 }
44 
45 static GB_ERROR GBT_check_alignment(GBDATA *gb_main, GBDATA *preset_alignment, GB_HASH *species_name_hash) {
46  /* check
47  * - whether alignment has correct size and
48  * - whether all data is present.
49  *
50  * Sets the security deletes and writes.
51  *
52  * If 'species_name_hash' is not NULp,
53  * - it initially has to contain value == 1 for each existing species.
54  * - afterwards it will contain value == 2 for each species where an alignment has been found.
55  */
56 
58  GBDATA *gb_extended_data = GBT_get_SAI_data(gb_main);
59 
61  char *ali_name = GBT_read_string(preset_alignment, "alignment_name");
62  if (!ali_name) error = "Alignment w/o 'alignment_name'";
63 
64  if (!error) {
65  long security_write = -1;
66  long stored_ali_len = -1;
67  long found_ali_len = -1;
68  long aligned = 1;
69  GBDATA *gb_ali_len = NULp;
70 
71  {
72  GBDATA *gb_ali_wsec = GB_entry(preset_alignment, "alignment_write_security");
73  if (!gb_ali_wsec) {
74  error = "has no 'alignment_write_security' entry";
75  }
76  else {
77  security_write = GB_read_int(gb_ali_wsec);
78  }
79  }
80 
81 
82  if (!error) {
83  gb_ali_len = GB_entry(preset_alignment, "alignment_len");
84  if (!gb_ali_len) {
85  error = "has no 'alignment_len' entry";
86  }
87  else {
88  stored_ali_len = GB_read_int(gb_ali_len);
89  }
90  }
91 
92  if (!error) {
93  GBDATA *gb_species;
94  for (gb_species = GBT_first_species_rel_species_data(gb_species_data);
95  gb_species && !error;
96  gb_species = GBT_next_species(gb_species))
97  {
98  GBDATA *gb_name = GB_entry(gb_species, "name");
99  const char *name = NULp;
100  int alignment_seen = 0;
101 
102  if (!gb_name) {
103  // fatal: name is missing -> create a unique name
104  char *unique = GBT_create_unique_species_name(gb_main, "autoname.");
105  error = GBT_write_string(gb_species, "name", unique);
106 
107  if (!error) {
108  gb_name = GB_entry(gb_species, "name");
109  GBS_write_hash(species_name_hash, unique, 1); // not seen before
110  GB_warningf("Seen unnamed species (gave name '%s')", unique);
111  }
112  free(unique);
113  }
114 
115  if (!error) {
116  name = GB_read_char_pntr(gb_name);
117  if (species_name_hash) {
118  int seen = GBS_read_hash(species_name_hash, name);
119 
120  gb_assert(seen != 0); // species_name_hash not initialized correctly
121  if (seen == 2) alignment_seen = 1; // already seen an alignment
122  }
123  }
124 
125  if (!error) {
126  GB_topSecurityLevel unsecured(gb_name);
127 
128  error = GB_write_security_delete(gb_name, 7);
129  if (!error) error = GB_write_security_write(gb_name, 6);
130 
131  if (!error) {
132  GBDATA *gb_ali = GB_entry(gb_species, ali_name);
133  if (gb_ali) {
134  GBDATA *gb_data = GB_entry(gb_ali, "data");
135  if (!gb_data) {
136  error = GBT_write_string(gb_ali, "data", "Error: entry 'data' was missing and therefore was filled with this text.");
137  GB_warningf("No '%s/data' entry for species '%s' (has been filled with dummy data)", ali_name, name);
138  }
139  else {
140  if (GB_read_type(gb_data) != GB_STRING) {
141  GB_delete(gb_data);
142  error = GBS_global_string("'%s/data' of species '%s' had wrong DB-type (%s) and has been deleted!",
143  ali_name, name, GB_read_key_pntr(gb_data));
144  }
145  else {
146  long data_len = GB_read_string_count(gb_data);
147  if (found_ali_len != data_len) {
148  if (found_ali_len>0) aligned = 0;
149  if (found_ali_len<data_len) found_ali_len = data_len;
150  }
151 
152  error = GB_write_security_delete(gb_data, 7);
153 
154  if (!alignment_seen && species_name_hash) { // mark as seen
155  GBS_write_hash(species_name_hash, name, 2); // 2 means "species has data in at least 1 alignment"
156  }
157  }
158  }
159  }
160  }
161 
162  if (!error) error = GB_write_security_delete(gb_species, security_write);
163  }
164  }
165  }
166 
167  if (!error) {
168  GBDATA *gb_sai;
169  for (gb_sai = GBT_first_SAI_rel_SAI_data(gb_extended_data);
170  gb_sai && !error;
171  gb_sai = GBT_next_SAI(gb_sai))
172  {
173  GBDATA *gb_sai_name = GB_entry(gb_sai, "name");
174  GBDATA *gb_ali;
175 
176  if (!gb_sai_name) continue;
177 
178  GB_write_security_delete(gb_sai_name, 7);
179 
180  gb_ali = GB_entry(gb_sai, ali_name);
181  if (gb_ali) {
182  GBDATA *gb_sai_data;
183  for (gb_sai_data = GB_child(gb_ali);
184  gb_sai_data;
185  gb_sai_data = GB_nextChild(gb_sai_data))
186  {
187  long type = GB_read_type(gb_sai_data);
188  long data_len;
189 
190  if (type == GB_DB || type < GB_BITS) continue;
191  if (GB_read_key_pntr(gb_sai_data)[0] == '_') continue; // e.g. _STRUCT (of secondary structure)
192 
193  data_len = GB_read_count(gb_sai_data);
194 
195  if (found_ali_len != data_len) {
196  if (found_ali_len>0) aligned = 0;
197  if (found_ali_len<data_len) found_ali_len = data_len;
198  }
199  }
200  }
201  }
202  }
203 
204  if (!error && stored_ali_len != found_ali_len) error = GB_write_int(gb_ali_len, found_ali_len);
205  if (!error) error = GBT_write_int(preset_alignment, "aligned", aligned);
206 
207  if (error) {
208  error = GBS_global_string("Error checking alignment '%s':\n%s\n", ali_name, error);
209  }
210  }
211 
212  free(ali_name);
213 
214  return error;
215 }
216 
218  /* alignment_name
219  * == 0 -> check all existing alignments
220  * otherwise -> check only one alignment
221  */
222 
223  GB_ERROR error = NULp;
224  GBDATA *gb_sd = GBT_get_species_data(Main);
225  GBDATA *gb_presets = GBT_get_presets(Main);
226  GB_HASH *species_name_hash = NULp;
227 
228  // create rest of main containers
229  GBT_get_SAI_data(Main);
230  GBT_get_tree_data(Main);
231 
232  if (alignment_name) {
233  GBDATA *gb_ali_name = GB_find_string(gb_presets, "alignment_name", alignment_name, GB_IGNORE_CASE, SEARCH_GRANDCHILD);
234  if (!gb_ali_name) {
235  error = GBS_global_string("Alignment '%s' does not exist - it can't be checked.", alignment_name);
236  }
237  }
238 
239  if (!error) {
240  // check whether we have an default alignment
241  GBDATA *gb_use = GB_entry(gb_presets, "use");
242  if (!gb_use) {
243  // if we have no default alignment -> look for any alignment
244  GBDATA *gb_ali_name = GB_find_string(gb_presets, "alignment_name", alignment_name, GB_IGNORE_CASE, SEARCH_GRANDCHILD);
245 
246  error = gb_ali_name ?
247  GBT_write_string(gb_presets, "use", GB_read_char_pntr(gb_ali_name)) :
248  "No alignment defined";
249  }
250  }
251 
252  if (!alignment_name && !error) {
253  // if all alignments are checked -> use species_name_hash to detect duplicated species and species w/o data
254  long duplicates = 0;
255  long unnamed = 0;
256 
257  species_name_hash = GBS_create_hash(GBT_get_species_count(Main), GB_IGNORE_CASE);
258 
259  if (!error) {
260  for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_sd);
261  gb_species;
262  gb_species = GBT_next_species(gb_species))
263  {
264  const char *name = GBT_get_name(gb_species);
265  if (name) {
266  if (GBS_read_hash(species_name_hash, name)) duplicates++;
267  GBS_incr_hash(species_name_hash, name);
268  }
269  else {
270  unnamed++;
271  }
272  }
273  }
274 
275  if (duplicates) {
276  error = GBS_global_string("Database is corrupted:\n"
277  "Found %li duplicated species with identical names!\n"
278  "Fix the problem using\n"
279  " 'Search For Equal Fields and Mark Duplicates'\n"
280  "in ARB_NTREE search tool, save DB and restart ARB.",
281  duplicates);
282  }
283  else if (unnamed) {
284  error = GBS_global_string("Database is corrupted:\n"
285  "Found %li species without IDs ('name')!\n"
286  "This is a sewere problem!\n"
287  "ARB will not work as expected!",
288  unnamed);
289  }
290  }
291 
292  if (!error) {
293  for (GBDATA *gb_ali = GB_entry(gb_presets, "alignment");
294  gb_ali && !error;
295  gb_ali = GB_nextEntry(gb_ali))
296  {
297  error = GBT_check_alignment(Main, gb_ali, species_name_hash);
298  }
299  }
300 
301  if (species_name_hash) {
302  if (!error) {
303  long counter = 0;
304  GBS_hash_do_const_loop(species_name_hash, check_for_species_without_data, &counter);
305  if (counter>0) {
306  GB_warningf("Found %li species without alignment data (only some were listed)", counter);
307  }
308  }
309 
310  GBS_free_hash(species_name_hash);
311  }
312 
313  return error;
314 }
315 
317  /* Get names of existing alignments from database.
318  *
319  * Returns: array of strings, the last element is NULp
320  * (Note: use GBT_free_names() to free result)
321  */
322 
323  GBDATA *presets = GBT_get_presets(gbd);
324  for (GBDATA *ali = GB_entry(presets, "alignment"); ali; ali = GB_nextEntry(ali)) {
325  GBDATA *name = GB_entry(ali, "alignment_name");
326  names.put(name ? GB_read_char_pntr(name) : "<unnamed alignment>");
327  }
328 }
329 
330 static char *gbt_nonexisting_alignment(GBDATA *gbMain) {
331  char *ali_other = NULp;
332  int counter;
333 
334  for (counter = 1; !ali_other; ++counter) {
335  ali_other = GBS_global_string_copy("ali_x%i", counter);
336  if (GBT_get_alignment(gbMain, ali_other)) freenull(ali_other); // exists -> continue
337  }
338 
339  GB_clear_error(); // from GBT_get_alignment
340  return ali_other;
341 }
342 
344  GB_ERROR error = GB_check_key(alignment_name);
345  if (!error && !ARB_strBeginsWith(alignment_name, "ali_")) {
346  error = GBS_global_string("alignment name '%s' has to start with 'ali_'", alignment_name);
347  }
348  return error;
349 }
350 
351 static GB_ERROR create_ali_strEntry(GBDATA *gb_ali, const char *field, const char *strval, long write_protection) {
352  GB_ERROR error = NULp;
353  GBDATA *gb_sub = GB_create(gb_ali, field, GB_STRING);
354 
355  if (!gb_sub) error = GB_await_error();
356  else {
357  error = GB_write_string(gb_sub, strval);
358  if (!error) error = GB_write_security_delete(gb_sub, 7);
359  if (!error) error = GB_write_security_write(gb_sub, write_protection);
360  }
361 
362  if (error) {
363  error = GBS_global_string("failed to create alignment subentry '%s'\n"
364  "(Reason: %s)", field, error);
365  }
366 
367  return error;
368 }
369 static GB_ERROR create_ali_intEntry(GBDATA *gb_ali, const char *field, int intval, long write_protection) {
370  GB_ERROR error = NULp;
371  GBDATA *gb_sub = GB_create(gb_ali, field, GB_INT);
372 
373  if (!gb_sub) error = GB_await_error();
374  else {
375  error = GB_write_int(gb_sub, intval);
376  if (!error) error = GB_write_security_delete(gb_sub, 7);
377  if (!error) error = GB_write_security_write(gb_sub, write_protection);
378  }
379 
380  if (error) {
381  error = GBS_global_string("failed to create alignment subentry '%s'\n"
382  "(Reason: %s)", field, error);
383  }
384 
385  return error;
386 }
387 
388 GBDATA *GBT_create_alignment(GBDATA *gb_main, const char *name, long len, long aligned, long security, const char *type) {
389  /* create alignment
390  *
391  * returns pointer to alignment or
392  * NULp (in this case an error has been exported)
393  */
394  GB_ERROR error = NULp;
395  GBDATA *gb_presets = GBT_get_presets(gb_main);
396  GBDATA *result = NULp;
397 
398  if (!gb_presets) {
399  error = GBS_global_string("can't find/create 'presets' (Reason: %s)", GB_await_error());
400  }
401  else {
402  error = GBT_check_alignment_name(name);
403  if (!error && (security<0 || security>6)) {
404  error = GBS_global_string("Illegal security value %li (allowed 0..6)", security);
405  }
406  if (!error) {
407  const char *allowed_types = ":dna:rna:ami:usr:";
408  int tlen = strlen(type);
409  const char *found = strstr(allowed_types, type);
410  if (!found || found == allowed_types || found[-1] != ':' || found[tlen] != ':') {
411  error = GBS_global_string("Invalid alignment type '%s'", type);
412  }
413  }
414 
415  if (!error) {
416  GBDATA *gb_name = GB_find_string(gb_presets, "alignment_name", name, GB_IGNORE_CASE, SEARCH_GRANDCHILD);
417 
418  if (gb_name) error = GBS_global_string("Alignment '%s' already exists", name);
419  else {
420  GBDATA *gb_ali = GB_create_container(gb_presets, "alignment");
421  if (!gb_ali) error = GB_await_error();
422  else {
423  error = GB_write_security_delete(gb_ali, 6);
424  if (!error) error = create_ali_strEntry(gb_ali, "alignment_name", name, 6);
425  if (!error) error = create_ali_intEntry(gb_ali, "alignment_len", len, 0);
426  if (!error) error = create_ali_intEntry(gb_ali, "aligned", aligned <= 0 ? 0 : 1, 0);
427  if (!error) error = create_ali_intEntry(gb_ali, "alignment_write_security", security, 6);
428  if (!error) error = create_ali_strEntry(gb_ali, "alignment_type", type, 0);
429  }
430 
431  if (!error) result = gb_ali;
432  }
433  }
434  }
435 
436  if (!result) {
437  gb_assert(error);
438  GB_export_errorf("in GBT_create_alignment: %s", error);
439  }
440 #if defined(DEBUG)
441  else gb_assert(!error);
442 #endif // DEBUG
443 
444  return result;
445 }
446 
447 
448 static GB_ERROR gbt_rename_alignment_of_item(GBDATA *gb_item_container, const char *item_name, const char *item_entry_name,
449  const char *source, const char *dest, bool copy, bool delete_src)
450 {
451  GB_ERROR error = NULp;
452  GBDATA *gb_item;
453 
454  for (gb_item = GB_entry(gb_item_container, item_entry_name);
455  gb_item && !error;
456  gb_item = GB_nextEntry(gb_item))
457  {
458  GBDATA *gb_ali = GB_entry(gb_item, source);
459  if (!gb_ali) continue;
460 
461  if (copy) {
462  GBDATA *gb_new = GB_entry(gb_item, dest);
463  if (gb_new) {
464  error = GBS_global_string("Entry '%s' already exists", dest);
465  }
466  else {
467  gb_new = GB_create_container(gb_item, dest);
468  if (!gb_new) error = GB_await_error();
469  else error = GB_copy_dropProtectMarksAndTempstate(gb_new, gb_ali);
470  }
471  }
472  if (delete_src) error = GB_delete(gb_ali);
473  }
474 
475  if (error && gb_item) {
476  error = GBS_global_string("%s\n(while renaming alignment for %s '%s')", error, item_name, GBT_get_name_or_description(gb_item));
477  }
478 
479  return error;
480 }
481 
482 GB_ERROR GBT_rename_alignment(GBDATA *gbMain, const char *source, const char *dest, bool copy, bool delete_src) {
483  /* if 'copy' is true -> create a copy
484  * if 'delete_src' is true -> delete source alignment
485  */
486 
487  gb_assert(!GB_have_error()); // illegal to enter this function when an error is exported!
488 
489  GB_ERROR error = NULp;
490  int is_case_error = 0;
491  GBDATA *gb_presets = GBT_get_presets(gbMain);
493  GBDATA *gb_extended_data = GBT_get_SAI_data(gbMain);
494 
495  if (!gb_presets || !gb_species_data || !gb_extended_data) error = GB_await_error();
496 
497  // create copy and/or delete old alignment description
498  if (!error) {
499  GBDATA *gb_old_alignment = GBT_get_alignment(gbMain, source);
500 
501  if (!gb_old_alignment) {
502  error = GB_await_error();
503  }
504  else {
505  if (copy) {
506  GBDATA *gbh = GBT_get_alignment(gbMain, dest);
507  if (gbh) {
508  error = GBS_global_string("destination alignment '%s' already exists", dest);
509  is_case_error = (strcasecmp(source, dest) == 0); // test for case-only difference
510  }
511  else {
512  GB_clear_error();
513  error = GBT_check_alignment_name(dest);
514  if (!error) {
515  GBDATA *gb_new_alignment = GB_create_container(gb_presets, "alignment");
516  error = GB_copy_dropProtectMarksAndTempstate(gb_new_alignment, gb_old_alignment);
517  if (!error) error = GBT_write_string(gb_new_alignment, "alignment_name", dest);
518  }
519  }
520  }
521 
522  if (delete_src && !error) {
523  error = GB_delete(gb_old_alignment);
524  }
525  }
526  }
527 
528  // change default alignment
529  if (!error && delete_src && copy) {
530  error = GBT_write_string(gb_presets, "use", dest);
531  }
532 
533  // copy and/or delete alignment entries in species
534  if (!error) {
535  error = gbt_rename_alignment_of_item(gb_species_data, "Species", "species", source, dest, copy, delete_src);
536  }
537 
538  // copy and/or delete alignment entries in SAIs
539  if (!error) {
540  error = gbt_rename_alignment_of_item(gb_extended_data, "SAI", "extended", source, dest, copy, delete_src);
541  }
542 
543  if (is_case_error) {
544  if (delete_src == 0) {
545  error = "Cannot copy alignment if destination name only differs in case.";
546  }
547  else {
549 
550  // alignments source and dest only differ in case
551  char *ali_other = gbt_nonexisting_alignment(gbMain);
552 
553  gb_assert(copy);
555 
556  printf("Renaming alignment '%s' -> '%s' -> '%s' (to avoid case-problem)\n", source, ali_other, dest);
557 
558  error = GBT_rename_alignment(gbMain, source, ali_other, true, delete_src);
559  if (!error) error = GBT_rename_alignment(gbMain, ali_other, dest, true, true);
560 
561  free(ali_other);
562  }
563  }
564 
565  return error;
566 }
567 
568 // -----------------------------------------
569 // alignment related item functions
570 
571 NOT4PERL GBDATA *GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type) {
572  // goes to header: __ATTR__DEPRECATED_TODO("better use GBT_create_sequence_data()")
573 
574  /* replace this function by GBT_create_sequence_data
575  * the same as GB_search(species, 'ali_name/key', GB_CREATE)
576  *
577  * Note: The behavior is weird, cause it does sth special for GB_STRING (write default content "...")
578  *
579  * returns create database entry (or NULp; exports an error in this case)
580  */
581 
582  GB_ERROR error = GB_check_key(ali_name);
583  if (error) {
584  error = GBS_global_string("Invalid alignment name '%s' (Reason: %s)", ali_name, error);
585  }
586  else {
587  error = GB_check_hkey(key);
588  if (error) {
589  error = GBS_global_string("Invalid field name '%s' (Reason: %s)", key, error);
590  }
591  }
592 
593  GBDATA *gb_data = NULp;
594  if (error) {
595  GB_export_error(error);
596  }
597  else {
598  GBDATA *gb_gb = GB_entry(species, ali_name);
599  if (!gb_gb) gb_gb = GB_create_container(species, ali_name);
600 
601  if (gb_gb) {
602  if (type == GB_STRING) {
603  gb_data = GB_search(gb_gb, key, GB_FIND);
604  if (!gb_data) {
605  gb_data = GB_search(gb_gb, key, GB_STRING);
606  GB_write_string(gb_data, "...");
607  }
608  }
609  else {
610  gb_data = GB_search(gb_gb, key, type);
611  }
612  }
613  }
614  return gb_data;
615 }
616 
617 NOT4PERL GBDATA *GBT_create_sequence_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type, int security_write) {
618  GBDATA *gb_data = GBT_add_data(species, ali_name, key, type);
619  if (gb_data) {
620  GB_ERROR error = GB_write_security_write(gb_data, security_write);
621  if (error) {
622  GB_export_error(error);
623  gb_data = NULp;
624  }
625  }
626  return gb_data;
627 }
628 
629 GBDATA *GBT_gen_accession_number(GBDATA *gb_species, const char *ali_name) {
630  GBDATA *gb_acc = GB_entry(gb_species, "acc");
631  if (!gb_acc) {
632  GBDATA *gb_data = GBT_find_sequence(gb_species, ali_name);
633  if (gb_data) { // found a valid alignment
634  GB_CSTR sequence = GB_read_char_pntr(gb_data);
635  long id = GBS_checksum(sequence, 1, ".-");
636  const char *acc = GBS_global_string("ARB_%lX", id);
637  GB_ERROR error = GBT_write_string(gb_species, "acc", acc);
638 
639  if (error) GB_export_error(error);
640  }
641  }
642  return gb_acc;
643 }
644 
645 
646 int GBT_is_partial(GBDATA *gb_species, int default_value, bool define_if_undef) {
647  // checks whether a species has a partial or full sequence
648  //
649  // Note: partial sequences should not be used for tree calculations
650  //
651  // returns: 0 if sequence is full
652  // 1 if sequence is partial
653  // -1 in case of error (which is exported in this case)
654  //
655  // if the sequence has no 'ARB_partial' entry it returns 'default_value'
656  // if 'define_if_undef' is true then create an 'ARB_partial'-entry with the default value
657 
658  int result = -1;
659  GB_ERROR error = NULp;
660  GBDATA *gb_partial = GB_entry(gb_species, "ARB_partial");
661 
662  if (gb_partial) {
663  result = GB_read_int(gb_partial);
664  if (result != 0 && result != 1) {
665  error = "Illegal value for 'ARB_partial' (only 1 or 0 allowed)";
666  }
667  }
668  else {
669  if (define_if_undef) {
670  error = GBT_write_int(gb_species, "ARB_partial", default_value);
671  }
672  result = default_value;
673  }
674 
675  if (error) {
676  GB_export_error(error);
677  return -1;
678  }
679  return result;
680 }
681 
682 GBDATA *GBT_find_sequence(GBDATA *gb_species, const char *aliname) {
683  GBDATA *gb_ali = GB_entry(gb_species, aliname);
684  return gb_ali ? GB_entry(gb_ali, "data") : NULp;
685 }
686 
688  gb_assert(!GB_have_error()); // illegal to enter this function when an error is exported!
689  return GBT_read_string(gb_main, GB_DEFAULT_ALIGNMENT);
690 }
691 
693  return GBT_write_string(gb_main, GB_DEFAULT_ALIGNMENT, alignment_name);
694 }
695 
696 GBDATA *GBT_get_alignment(GBDATA *gb_main, const char *aliname) {
700  if (!aliname) {
701  GB_export_error("no alignment given");
702  return NULp;
703  }
704 
705  GBDATA *gb_presets = GBT_get_presets(gb_main);
706  GBDATA *gb_alignment_name = GB_find_string(gb_presets, "alignment_name", aliname, GB_IGNORE_CASE, SEARCH_GRANDCHILD);
707 
708  if (!gb_alignment_name) {
709  GB_export_errorf("alignment '%s' not found", aliname);
710  return NULp;
711  }
712  return GB_get_father(gb_alignment_name);
713 }
714 
715 // @@@ recode and change result type to long* ?
716 long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname) {
720  GBDATA *gb_alignment = GBT_get_alignment(gb_main, aliname);
721  return gb_alignment ? *GBT_read_int(gb_alignment, "alignment_len") : -1;
722 }
723 
724 GB_ERROR GBT_set_alignment_len(GBDATA *gb_main, const char *aliname, long new_len) {
725  GB_ERROR error = NULp;
726  GBDATA *gb_alignment = GBT_get_alignment(gb_main, aliname);
727 
728  if (gb_alignment) {
729  GB_topSecurityLevel unsecured(gb_main);
730  error = GBT_write_int(gb_alignment, "alignment_len", new_len); // write new len
731  if (!error) error = GBT_write_int(gb_alignment, "aligned", 0); // mark as unaligned
732  }
733  else error = GB_export_errorf("Alignment '%s' not found", aliname);
734  return error;
735 }
736 
737 char *GBT_get_alignment_type_string(GBDATA *gb_main, const char *aliname) {
741  char *result = NULp;
742  GBDATA *gb_alignment = GBT_get_alignment(gb_main, aliname);
743  if (gb_alignment) {
744  result = GBT_read_string(gb_alignment, "alignment_type");
745  gb_assert(result);
746  }
747  return result;
748 }
749 
751  char *ali_type = GBT_get_alignment_type_string(gb_main, aliname);
753 
754  if (ali_type) {
755  switch (ali_type[0]) {
756  case 'r': if (strcmp(ali_type, "rna")==0) at = GB_AT_RNA; break;
757  case 'd': if (strcmp(ali_type, "dna")==0) at = GB_AT_DNA; break;
758  case 'a': if (strcmp(ali_type, "ami")==0) at = GB_AT_AA; break;
759  case 'p': if (strcmp(ali_type, "pro")==0) at = GB_AT_AA; break;
760  default: gb_assert(0); break;
761  }
762  free(ali_type);
763  }
764  return at;
765 }
766 
768  return GBT_get_alignment_type(gb_main, alignment_name) == GB_AT_AA;
769 }
770 
771 // -----------------------
772 // gene sequence
773 
774 static const char *gb_cache_genome(GBDATA *gb_genome) {
775  static GBDATA *gb_last_genome = NULp;
776  static char *last_genome = NULp;
777 
778  if (gb_genome != gb_last_genome) {
779  free(last_genome);
780 
781  last_genome = GB_read_string(gb_genome);
782  gb_last_genome = gb_genome;
783  }
784 
785  return last_genome;
786 }
787 
789  int parts; // initialized for parts
790  unsigned char *certain; // contains parts "=" chars
791  char offset[256];
792 };
793 
795 
796 static void init_gpp(int parts) {
797  if (!gpp) {
798  int i;
799  ARB_alloc(gpp, 1);
800  gpp->certain = NULp;
801 
802  for (i = 0; i<256; ++i) gpp->offset[i] = 0;
803 
804  gpp->offset[(int)'+'] = 1;
805  gpp->offset[(int)'-'] = -1;
806  }
807  else {
808  if (parts>gpp->parts) freenull(gpp->certain);
809  }
810 
811  if (!gpp->certain) {
812  int forParts = parts+10;
813  ARB_alloc(gpp->certain, forParts+1);
814  memset(gpp->certain, '=', forParts);
815  gpp->certain[forParts] = 0;
816  gpp->parts = forParts;
817  }
818 }
819 
820 static void getPartPositions(const GEN_position *pos, int part, size_t *startPos, size_t *stopPos) {
821  // returns 'startPos' and 'stopPos' of one part of a gene
822  gb_assert(part<pos->parts);
823  *startPos = pos->start_pos[part]+gpp->offset[(pos->start_uncertain ? pos->start_uncertain : gpp->certain)[part]];
824  *stopPos = pos->stop_pos [part]+gpp->offset[(pos->stop_uncertain ? pos->stop_uncertain : gpp->certain)[part]];
825 }
826 
827 NOT4PERL char *GBT_read_gene_sequence_and_length(GBDATA *gb_gene, bool use_revComplement, char partSeparator, size_t *gene_length) {
828  // return the sequence data of a gene
829  //
830  // if use_revComplement is true -> use data from complementary strand (if complement is set for gene)
831  // otherwise -> use data from primary strand (sort+merge parts by position)
832  //
833  // if partSeparator not is 0 -> insert partSeparator between single (non-merged) parts
834  //
835  // returns sequence as result (and length of sequence if 'gene_length' points to something)
836  //
837  // if 'pos_certain' contains '+', start behind position (or end at position)
838  // '-', start at position (or end before position)
839  //
840  // For zero-length genes (e.g. "711^712") this function returns an empty string.
841 
842  GB_ERROR error = NULp;
843  char *result = NULp;
844  GBDATA *gb_species = GB_get_grandfather(gb_gene);
845  GEN_position *pos = GEN_read_position(gb_gene);
846 
847  if (!pos) error = GB_await_error();
848  else {
849  GBDATA *gb_seq = GBT_find_sequence(gb_species, "ali_genom");
850  unsigned long seq_length = GB_read_count(gb_seq);
851  int p;
852  int parts = pos->parts;
853  int resultlen = 0;
854  int separatorSize = partSeparator ? 1 : 0;
855 
856  init_gpp(parts);
857 
858  // test positions and calculate overall result length
859  for (p = 0; p<parts && !error; p++) {
860  size_t start;
861  size_t stop;
862  getPartPositions(pos, p, &start, &stop);
863 
864  if (start<1 || start>(stop+1) || stop > seq_length) { // do not reject zero-length genes (start == stop+1)
865  error = GBS_global_string("Illegal gene position(s): start=%zu, end=%zu, seq.length=%li",
866  start, stop, seq_length);
867  }
868  else {
869  resultlen += stop-start+1;
870  }
871  }
872 
873  if (separatorSize) resultlen += (parts-1)*separatorSize;
874 
875  if (!error) {
876  char T_or_U = 0;
877  if (use_revComplement) {
878  error = GBT_determine_T_or_U(GB_AT_DNA, &T_or_U, "reverse-complement");
879  }
880  else if (parts>1) {
882  parts = pos->parts; // may have changed
883  }
884 
885  if (!error) {
886  const char *seq_data = gb_cache_genome(gb_seq);
887  char *resultpos;
888 
889  ARB_alloc(result, resultlen+1);
890  resultpos = result;
891 
892  if (gene_length) *gene_length = resultlen;
893 
894  for (p = 0; p<parts; ++p) {
895  size_t start;
896  size_t stop;
897  getPartPositions(pos, p, &start, &stop);
898 
899  int len = stop-start+1;
900 
901  if (separatorSize && p>0) *resultpos++ = partSeparator;
902 
903  memcpy(resultpos, seq_data+start-1, len);
904  if (T_or_U && pos->complement[p]) {
905  GBT_reverseComplementNucSequence(resultpos, len, T_or_U);
906  }
907  resultpos += len;
908  }
909 
910  resultpos[0] = 0;
911  }
912  }
913  GEN_free_position(pos);
914  }
915 
916  gb_assert(result || error);
917 
918  if (error) {
919  char *id = GEN_global_gene_identifier(gb_gene, gb_species);
920  error = GB_export_errorf("Can't read sequence of '%s' (Reason: %s)", id, error);
921  free(id);
922  free(result);
923  result = NULp;
924  }
925 
926  return result;
927 }
928 
929 char *GBT_read_gene_sequence(GBDATA *gb_gene, bool use_revComplement, char partSeparator) {
930  return GBT_read_gene_sequence_and_length(gb_gene, use_revComplement, partSeparator, NULp);
931 }
932 
933 // --------------------------------------------------------------------------------
934 
935 #ifdef UNIT_TESTS
936 #include <test_unit.h>
937 
938 void TEST_alignment() {
939  GB_shell shell;
940  GBDATA *gb_main = GB_open("TEST_prot.arb", "r");
941 
942  {
943  GB_transaction ta(gb_main);
944 
946 
947  char *def_ali_name = GBT_get_default_alignment(gb_main);
948  TEST_EXPECT_EQUAL(def_ali_name, "ali_tuf_dna");
949 
950  {
952  GBT_get_alignment_names(names, gb_main);
953  {
954  char *joined = GBT_join_strings(names, '*');
955  TEST_EXPECT_EQUAL(joined, "ali_tuf_pro*ali_tuf_dna");
956  free(joined);
957  }
958 
959  for (int i = 0; names[i]; ++i) {
960  long len = GBT_get_alignment_len(gb_main, names[i]);
961  TEST_EXPECT_EQUAL(len, i ? 1462 : 487);
962 
963  char *type_name = GBT_get_alignment_type_string(gb_main, names[i]);
964  TEST_EXPECT_EQUAL(type_name, i ? "dna" : "ami");
965  free(type_name);
966 
967  GB_alignment_type type = GBT_get_alignment_type(gb_main, names[i]);
968  TEST_EXPECT_EQUAL(type, i ? GB_AT_DNA : GB_AT_AA);
969  TEST_EXPECT_EQUAL(GBT_is_alignment_protein(gb_main, names[i]), !i);
970  }
971  }
972 
973  // test functions called with aliname==NULp
974  TEST_EXPECT_NORESULT__ERROREXPORTED_CONTAINS(GBT_get_alignment(gb_main, NULp), "no alignment");
976  TEST_EXPECT_CONTAINS(GB_await_error(), "no alignment");
977  TEST_EXPECT_NORESULT__ERROREXPORTED_CONTAINS(GBT_get_alignment_type_string(gb_main, NULp), "no alignment");
978 
979  free(def_ali_name);
980  }
981 
982  GB_close(gb_main);
983 }
984 TEST_PUBLISH(TEST_alignment);
985 
986 #endif // UNIT_TESTS
GB_ERROR GB_check_key(const char *key) __ATTR__USERESULT
Definition: adstring.cxx:85
GB_ERROR GB_copy_dropProtectMarksAndTempstate(GBDATA *dest, GBDATA *source)
Definition: arbdb.cxx:2152
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:716
const char * GB_ERROR
Definition: arb_core.h:25
string result
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GB_TYPES type
unsigned char * complement
Definition: adGene.h:41
void put(const char *elem)
Definition: arb_strarray.h:199
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:729
GBDATA * GB_child(GBDATA *father)
Definition: adquery.cxx:322
static void init_gpp(int parts)
Definition: adali.cxx:796
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:457
Definition: arbdb.h:69
#define GB_DEFAULT_ALIGNMENT
Definition: arbdb.h:30
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1387
char * GBT_read_gene_sequence(GBDATA *gb_gene, bool use_revComplement, char partSeparator)
Definition: adali.cxx:929
void GEN_free_position(GEN_position *pos)
Definition: adGene.cxx:195
long GBS_incr_hash(GB_HASH *hs, const char *key)
Definition: adhash.cxx:470
static void getPartPositions(const GEN_position *pos, int part, size_t *startPos, size_t *stopPos)
Definition: adali.cxx:820
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
static const char * gb_cache_genome(GBDATA *gb_genome)
Definition: adali.cxx:774
int parts
Definition: adGene.h:37
static GB_ERROR create_ali_strEntry(GBDATA *gb_ali, const char *field, const char *strval, long write_protection)
Definition: adali.cxx:351
static GB_ERROR gbt_rename_alignment_of_item(GBDATA *gb_item_container, const char *item_name, const char *item_entry_name, const char *source, const char *dest, bool copy, bool delete_src)
Definition: adali.cxx:448
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:202
static char * alignment_name
bool GB_have_error()
Definition: arb_msg.cxx:337
void GEN_sortAndMergeLocationParts(GEN_position *location)
Definition: adGene.cxx:497
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:541
GBDATA * GBT_get_tree_data(GBDATA *gb_main)
Definition: adtree.cxx:26
GBDATA * GB_get_grandfather(GBDATA *gbd)
Definition: arbdb.cxx:1728
GBDATA * GB_get_father(GBDATA *gbd)
Definition: arbdb.cxx:1722
static char * gbt_nonexisting_alignment(GBDATA *gbMain)
Definition: adali.cxx:330
GBDATA * GBT_get_alignment(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:696
#define NOT4PERL
Definition: arbdb_base.h:23
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
GBDATA * GBT_first_species_rel_species_data(GBDATA *gb_species_data)
Definition: aditem.cxx:121
static HelixNrInfo * start
GB_ERROR GB_check_hkey(const char *key) __ATTR__USERESULT
Definition: adstring.cxx:92
static FullNameMap names
#define TEST_PUBLISH(testfunction)
Definition: test_unit.h:1485
#define TEST_EXPECT_CONTAINS(str, part)
Definition: test_unit.h:1301
GB_ERROR GB_export_error(const char *error)
Definition: arb_msg.cxx:256
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:916
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:341
GB_alignment_type GBT_get_alignment_type(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:750
NOT4PERL long * GBT_read_int(GBDATA *gb_container, const char *fieldpath)
Definition: adtools.cxx:327
GBDATA * GB_create_container(GBDATA *father, const char *key)
Definition: arbdb.cxx:1829
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:758
char * GBT_read_string(GBDATA *gb_container, const char *fieldpath)
Definition: adtools.cxx:267
Definition: arbdb.h:78
GB_TYPES GB_read_type(GBDATA *gbd)
Definition: arbdb.cxx:1643
void GB_warningf(const char *templat,...)
Definition: arb_msg.cxx:477
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
GB_CSTR GB_read_key_pntr(GBDATA *gbd)
Definition: arbdb.cxx:1656
NOT4PERL void GBT_reverseComplementNucSequence(char *seq, long length, char T_or_U)
Definition: adRevCompl.cxx:102
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1781
GBDATA * GBT_get_presets(GBDATA *gb_main)
Definition: adali.cxx:29
GBDATA * gb_species_data
Definition: adname.cxx:34
unsigned char * start_uncertain
Definition: adGene.h:53
void GBS_hash_do_const_loop(const GB_HASH *hs, gb_hash_const_loop_type func, void *client_data)
Definition: adhash.cxx:562
void GB_clear_error()
Definition: arb_msg.cxx:353
NOT4PERL char * GBT_read_gene_sequence_and_length(GBDATA *gb_gene, bool use_revComplement, char partSeparator, size_t *gene_length)
Definition: adali.cxx:827
static void check_for_species_without_data(const char *species_name, long value, void *counterPtr)
Definition: adali.cxx:19
char * GBT_create_unique_species_name(GBDATA *gb_main, const char *default_name)
Definition: aditem.cxx:287
size_t * stop_pos
Definition: adGene.h:40
static void error(const char *msg)
Definition: mkptypes.cxx:96
char * GBT_get_alignment_type_string(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:737
GBDATA * GBT_first_SAI_rel_SAI_data(GBDATA *gb_sai_data)
Definition: aditem.cxx:159
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:687
GBDATA * GBT_create_alignment(GBDATA *gb_main, const char *name, long len, long aligned, long security, const char *type)
Definition: adali.cxx:388
GB_ERROR GB_write_security_write(GBDATA *gbd, unsigned long level)
Definition: arbdb.cxx:1584
GB_ERROR GBT_rename_alignment(GBDATA *gbMain, const char *source, const char *dest, bool copy, bool delete_src)
Definition: adali.cxx:482
Definition: arbdb.h:86
NOT4PERL GBDATA * GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type)
Definition: adali.cxx:571
GB_ERROR GB_write_int(GBDATA *gbd, long i)
Definition: arbdb.cxx:1250
GB_alignment_type
Definition: arbdb_base.h:61
GB_ERROR GBT_set_alignment_len(GBDATA *gb_main, const char *aliname, long new_len)
Definition: adali.cxx:724
int GBT_count_alignments(GBDATA *gb_main)
Definition: adali.cxx:33
int GBT_is_partial(GBDATA *gb_species, int default_value, bool define_if_undef)
Definition: adali.cxx:646
char * GBT_join_strings(const CharPtrArray &strings, char separator)
GBDATA * GBT_next_SAI(GBDATA *gb_sai)
Definition: aditem.cxx:166
static GB_ERROR GBT_check_alignment(GBDATA *gb_main, GBDATA *preset_alignment, GB_HASH *species_name_hash)
Definition: adali.cxx:45
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:261
bool GBT_is_alignment_protein(GBDATA *gb_main, const char *alignment_name)
Definition: adali.cxx:767
static void copy(double **i, double **j)
Definition: trnsprob.cxx:32
char * GEN_global_gene_identifier(GBDATA *gb_gene, GBDATA *gb_organism)
Definition: adGene.cxx:793
#define gb_assert(cond)
Definition: arbdbt.h:11
static gene_part_pos * gpp
Definition: adali.cxx:794
GB_ERROR GBT_set_default_alignment(GBDATA *gb_main, const char *alignment_name)
Definition: adali.cxx:692
unsigned char * certain
Definition: adali.cxx:790
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:909
GBDATA * GBT_find_or_create(GBDATA *father, const char *key, long delete_level)
Definition: adtools.cxx:42
void GBT_get_alignment_names(ConstStrArray &names, GBDATA *gbd)
Definition: adali.cxx:316
GB_ERROR GB_write_security_delete(GBDATA *gbd, unsigned long level)
Definition: arbdb.cxx:1605
uint32_t GBS_checksum(const char *seq, int ignore_case, const char *exclude)
Definition: adstring.cxx:353
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:682
size_t * start_pos
Definition: adGene.h:39
GBDATA * GB_find_string(GBDATA *gbd, const char *key, const char *str, GB_CASE case_sens, GB_SEARCH_TYPE gbs)
Definition: adquery.cxx:302
GB_ERROR GBT_check_alignment_name(const char *alignment_name)
Definition: adali.cxx:343
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:114
GB_ERROR GBT_check_data(GBDATA *Main, const char *alignment_name)
Definition: adali.cxx:217
bool ARB_strBeginsWith(const char *str, const char *with)
Definition: arb_str.h:42
const char * GBT_get_name(GBDATA *gb_item)
Definition: aditem.cxx:469
GB_TYPES
Definition: arbdb.h:62
GBDATA * GB_nextChild(GBDATA *child)
Definition: adquery.cxx:326
char offset[256]
Definition: adali.cxx:791
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GB_transaction ta(gb_var)
GB_ERROR GBT_write_int(GBDATA *gb_container, const char *fieldpath, long content)
Definition: adtools.cxx:471
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:33
GBDATA * GBT_get_SAI_data(GBDATA *gb_main)
Definition: aditem.cxx:154
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:460
NOT4PERL GBDATA * GBT_create_sequence_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type, int security_write)
Definition: adali.cxx:617
GEN_position * GEN_read_position(GBDATA *gb_gene)
Definition: adGene.cxx:250
const char * GB_CSTR
Definition: arbdb_base.h:25
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
GBDATA * GBT_gen_accession_number(GBDATA *gb_species, const char *ali_name)
Definition: adali.cxx:629
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:395
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
static GB_ERROR create_ali_intEntry(GBDATA *gb_ali, const char *field, int intval, long write_protection)
Definition: adali.cxx:369
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:193
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:655
NOT4PERL GB_ERROR GBT_determine_T_or_U(GB_alignment_type alignment_type, char *T_or_U, const char *supposed_target)
Definition: adRevCompl.cxx:90
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
unsigned char * stop_uncertain
Definition: adGene.h:54
Definition: arbdb.h:66
GBDATA * GBT_get_species_data(GBDATA *gb_main)
Definition: aditem.cxx:105