ARB
adGene.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : adGene.cxx //
4 // Purpose : Basic gene access functions //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in July 2002 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "gb_local.h"
13 
14 #include <adGene.h>
15 #include <arbdbt.h>
16 #include <arb_strbuf.h>
17 #include <arb_strarray.h>
18 
19 
20 bool GEN_is_genome_db(GBDATA *gb_main, int default_value) {
21  // default_value == 0 -> default to normal database
22  // == 1 -> default to GENOM database
23  // == -1 -> assume that type is already defined (is defined for all DBs ever loaded in ARB-gui)
24  GBDATA *gb_genom_db = GB_entry(gb_main, GENOM_DB_TYPE);
25 
26  if (!gb_genom_db) { // no DB-type entry -> create one with default
28 
29  assert_or_exit(default_value != -1); // first call to GEN_is_genome_db has to provide a 'default_value'
30 
31  gb_genom_db = GB_create(gb_main, GENOM_DB_TYPE, GB_INT);
32  if (!gb_genom_db) error = GB_await_error();
33  else error = GB_write_int(gb_genom_db, default_value);
34 
35  if (error) GBK_terminatef("Fatal in GEN_is_genome_db: %s", error);
36  }
37 
38  return GB_read_int(gb_genom_db) != 0;
39 }
40 
41 // --------------
42 // genes:
43 
45  GBDATA *gb_gene_data = GB_search(gb_species, "gene_data", GB_CREATE_CONTAINER);
46  gb_assert(gb_gene_data);
47  return gb_gene_data;
48 }
49 
51  return GB_search(gb_species, "gene_data", GB_FIND);
52 }
53 
55  GBDATA *gb_gene_data = GB_search(gb_species, "gene_data", GB_FIND);
56  gb_assert(gb_gene_data);
57  return gb_gene_data;
58 }
59 
60 GBDATA* GEN_find_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name) {
61  return GBT_find_item_rel_item_data(gb_gene_data, "name", name);
62 }
63 
64 GBDATA* GEN_find_gene(GBDATA *gb_species, const char *name) {
65  // Search for a gene.
66  // Return found gene or NULp (in this case an error MAY be exported).
67  //
68  // Note: If you expect the gene to exists, use GEN_expect_gene!
69  GBDATA *gb_gene_data = GEN_find_gene_data(gb_species);
70  return gb_gene_data ? GEN_find_gene_rel_gene_data(gb_gene_data, name) : NULp;
71 }
72 GBDATA* GEN_expect_gene(GBDATA *gb_species, const char *name) {
73  // Returns an existing gene or
74  // NULp (in that case an error is exported)
75  return GBT_expect_item_rel_item_data(GEN_expect_gene_data(gb_species), "name", name);
76 }
77 
78 static GBDATA* GEN_create_nonexisting_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name) {
79  GB_ERROR error = GB_push_transaction(gb_gene_data);
80  GBDATA *gb_gene = NULp;
81 
82  gb_assert(!GEN_find_gene_rel_gene_data(gb_gene_data, name)); // don't call this function if you are not sure that the gene does not exists!
83 
84  if (!error) {
85  gb_gene = GB_create_container(gb_gene_data, "gene");
86  error = gb_gene ? GBT_write_string(gb_gene, "name", name) : GB_await_error();
87  }
88 
89  gb_assert(gb_gene || error);
90  error = GB_end_transaction(gb_gene_data, error);
91  if (error) GB_export_error(error);
92 
93  return gb_gene;
94 }
95 
96 GBDATA* GEN_create_nonexisting_gene(GBDATA *gb_species, const char *name) { // needed by ../PERL_SCRIPTS/GENOME/GI.pm@create_nonexisting_gene
98 }
99 
100 GBDATA* GEN_find_or_create_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name) {
101  GBDATA *gb_gene = NULp;
102 
103  // Search for a gene, when gene does not exist create it
104  if (!name || !name[0]) {
105  GB_export_error("Missing gene name");
106  }
107  else {
108  GBDATA *gb_name = GB_find_string(gb_gene_data, "name", name, GB_IGNORE_CASE, SEARCH_GRANDCHILD);
109 
110  if (gb_name) {
111  gb_gene = GB_get_father(gb_name); // found existing gene
112  }
113  else {
114  GB_ERROR error = GB_push_transaction(gb_gene_data);
115 
116  if (!error) {
117  gb_gene = GB_create_container(gb_gene_data, "gene");
118  error = GBT_write_string(gb_gene, "name", name);
119  }
120  error = GB_end_transaction(gb_gene_data, error);
121  if (error) {
122  gb_gene = NULp;
123  GB_export_error(error);
124  }
125  }
126  }
127  return gb_gene;
128 }
129 
130 GBDATA* GEN_first_gene(GBDATA *gb_species) {
131  return GB_entry(GEN_expect_gene_data(gb_species), "gene");
132 }
133 
135  return GB_entry(gb_gene_data, "gene");
136 }
137 
139  gb_assert(GB_has_key(gb_gene, "gene"));
140  return GB_nextEntry(gb_gene);
141 }
142 
144  return GB_first_marked(GEN_expect_gene_data(gb_species), "gene");
145 }
147  return GB_next_marked(gb_gene, "gene");
148 }
149 
150 // ----------------------
151 // gene position
152 
154 
155 GEN_position *GEN_new_position(int parts, bool joinable) {
156  GEN_position *pos;
157 
158  size_t pos_size = parts*sizeof(pos->start_pos[0]);
159  size_t comp_size = parts*sizeof(pos->complement[0]);
160  size_t data_size = 2*pos_size+3*comp_size;
161 
162  gb_assert(parts>0);
163 
164  if (lastFreedPosition && lastFreedPosition->parts == parts) {
165  pos = lastFreedPosition;
166  lastFreedPosition = NULp;
167  memset(pos->start_pos, 0, data_size);
168  }
169  else {
170  ARB_calloc(pos, 1);
171  pos->parts = parts;
172  pos->start_pos = (size_t*)ARB_calloc<char>(data_size);
173  pos->stop_pos = pos->start_pos+parts;
174  pos->complement = (unsigned char*)(pos->stop_pos+parts);
175  }
176 
177  pos->joinable = joinable;
178  pos->start_uncertain = NULp;
179  pos->stop_uncertain = NULp;
180 
181  return pos;
182 }
183 
185  if (!pos->start_uncertain) {
186  // space was already allocated in GEN_new_position
187  pos->start_uncertain = pos->complement+pos->parts;
188  pos->stop_uncertain = pos->start_uncertain+pos->parts;
189 
190  size_t comp_size = pos->parts*sizeof(pos->complement[0]);
191  memset(pos->start_uncertain, '=', 2*comp_size);
192  }
193 }
194 
196  if (pos) {
197  if (lastFreedPosition) {
198  free(lastFreedPosition->start_pos); // rest is allocated together with start_pos
199  free(lastFreedPosition);
200  }
201 
202  lastFreedPosition = pos;
203  }
204 }
205 
209 
210 
211 static GB_ERROR parseCSV(GBDATA *gb_gene, const char *field_name, size_t parts_expected, ConstStrArray& parseTable) {
212  // reads a field and splits the content at ','
213  // results are stored in parseTable
214 
215  GB_ERROR error = NULp;
216  GBDATA *gb_field = GB_entry(gb_gene, field_name);
217  if (!gb_field) error = GBS_global_string("Expected entry '%s' missing", field_name);
218  else {
219  char *content = GB_read_string(gb_field);
220  if (!content) error = GB_await_error();
221  else {
222  parseTable.erase();
223  GBT_splitNdestroy_string(parseTable, content, ',');
224  if (parseTable.size() != parts_expected) {
225  error = GBS_global_string("Expected %zu CSV, found %zu", parts_expected, parseTable.size());
226  }
227  }
228  }
229  return error;
230 }
231 
232 static GB_ERROR parsePositions(GBDATA *gb_gene, const char *field_name, int parts_expected, size_t *results, ConstStrArray& parseTable) {
233  GB_ERROR error = parseCSV(gb_gene, field_name, parts_expected, parseTable);
234  if (!error) {
235  int p;
236  for (p = 0; p<parts_expected && !error; p++) {
237  char *end;
238  results[p] = strtol(parseTable[p], &end, 10);
239  if (end == parseTable[p]) { // error
240  error = GBS_global_string("can't convert '%s' to number", parseTable[p]);
241  }
242  }
243  }
244  if (error) {
245  error = GBS_global_string("While parsing field '%s': %s", field_name, error);
246  }
247  return error;
248 }
249 
251  int parts = 1;
252  bool joinable = false;
253  GBDATA *gb_pos_joined = GB_entry(gb_gene, "pos_joined");
254  GEN_position *pos = NULp;
255  GB_ERROR error = NULp;
256 
257  if (gb_pos_joined) {
258  parts = GB_read_int(gb_pos_joined);
259  if (parts != 1) { // split
260  if (parts>1) joinable = true;
261  else if (parts<-1) parts = -parts; // neg value means "not joinable" (comes from feature location 'order(...)')
262  else error = GBS_global_string("Illegal value %i in 'pos_joined'", parts);
263  }
264  }
265 
266  if (!error) {
267  pos = GEN_new_position(parts, joinable);
268 
269  ConstStrArray parseTable;
270  parseTable.reserve(parts);
271 
272  error = parsePositions(gb_gene, "pos_start", parts, pos->start_pos, parseTable);
273  if (!error) error = parsePositions(gb_gene, "pos_stop", parts, pos->stop_pos, parseTable);
274 
275  int p;
276  if (!error) {
277  error = parseCSV(gb_gene, "pos_complement", parts, parseTable);
278  for (p = 0; p<parts && !error; p++) {
279  const char *val = parseTable[p];
280  if ((val[0] != '0' && val[0] != '1') || val[1] != 0) {
281  error = GBS_global_string("Invalid content '%s' in 'pos_complement' (expected: \"01\")", val);
282  }
283  else {
284  pos->complement[p] = (unsigned char)atoi(val);
285  }
286  }
287  }
288 
289  if (!error) {
290  GBDATA *gb_pos_certain = GB_entry(gb_gene, "pos_certain");
291 
292  if (gb_pos_certain) {
293  error = parseCSV(gb_gene, "pos_certain", parts, parseTable);
295  for (p = 0; p<parts && !error; p++) {
296  const unsigned char *val = (unsigned char *)(parseTable[p]);
297  int vp;
298 
299  for (vp = 0; vp<2; vp++) {
300  unsigned char c = val[vp];
301  if (c != '<' && c != '=' && c != '>' && (c != "+-"[vp])) {
302  error = GBS_global_string("Invalid content '%s' in 'pos_certain' (expected 2 from \"<=>\")", val);
303  }
304  }
305  if (!error) {
306  pos->start_uncertain[p] = val[0];
307  pos->stop_uncertain[p] = val[1];
308  }
309  }
310  }
311  }
312  }
313 
314  gb_assert(error || pos);
315  if (error) {
316  GB_export_error(error);
317  if (pos) {
318  GEN_free_position(pos);
319  pos = NULp;
320  }
321  }
322  return pos;
323 }
324 
325 GB_ERROR GEN_write_position(GBDATA *gb_gene, const GEN_position *pos, long seqLength) {
326  // if 'seqLength' is != 0, it is used to check the correctness of 'pos'
327  // (otherwise this function reads the genome-sequence to detect its length)
328 
329  GB_ERROR error = NULp;
330  GBDATA *gb_pos_joined = GB_entry(gb_gene, "pos_joined");
331  GBDATA *gb_pos_certain = GB_entry(gb_gene, "pos_certain");
332  GBDATA *gb_pos_start;
333  GBDATA *gb_pos_stop;
334  GBDATA *gb_pos_complement;
335  int p;
336 
337  gb_assert(pos);
338 
339  gb_pos_start = GB_search(gb_gene, "pos_start", GB_STRING);
340  if (!gb_pos_start) error = GB_await_error();
341 
342  if (!error) {
343  gb_pos_stop = GB_search(gb_gene, "pos_stop", GB_STRING);
344  if (!gb_pos_stop) error = GB_await_error();
345  }
346  if (!error) {
347  gb_pos_complement = GB_search(gb_gene, "pos_complement", GB_STRING);
348  if (!gb_pos_complement) error = GB_await_error();
349  }
350 
351  if (!error) {
352  if (pos->start_uncertain) {
353  if (!gb_pos_certain) {
354  gb_pos_certain = GB_search(gb_gene, "pos_certain", GB_STRING);
355  if (!gb_pos_certain) error = GB_await_error();
356  }
357  }
358  else {
359  if (gb_pos_certain) {
360  error = GB_delete(gb_pos_certain);
361  gb_pos_certain = NULp;
362  }
363  }
364  }
365 
366  // test data
367  if (!error) {
368  size_t length;
369 
370  if (seqLength) {
371  length = seqLength;
372  }
373  else { // unknown -> autodetect
374  GBDATA *gb_organism = GB_get_grandfather(gb_gene);
375  GBDATA *gb_genome = GBT_find_sequence(gb_organism, GENOM_ALIGNMENT);
376 
377  length = GB_read_count(gb_genome);
378  }
379 
380  for (p = 0; p<pos->parts && !error; ++p) {
381  char c;
382 
383  c = pos->complement[p]; gb_assert(c == 0 || c == 1);
384  if (c<0 || c>1) {
385  error = GBS_global_string("Illegal value %i in complement", int(c));
386  }
387  else {
388  if (pos->start_pos[p]>pos->stop_pos[p]) {
389  error = GBS_global_string("Illegal positions (%zu>%zu)", pos->start_pos[p], pos->stop_pos[p]);
390  }
391  else if (pos->start_pos[p] == 0) {
392  error = GBS_global_string("Illegal start position %zu", pos->start_pos[p]);
393  }
394  else if (pos->stop_pos[p] > length) {
395  error = GBS_global_string("Illegal stop position %zu (>length(=%zu))", pos->stop_pos[p], length);
396  }
397  else {
398  if (pos->start_uncertain) {
399  c = pos->start_uncertain[p];
400  char c2 = pos->stop_uncertain[p];
401 
402  if (!c || !strchr("<=>+", c)) error = GBS_global_string("Invalid uncertainty '%c'", c);
403  else if (!c2 || !strchr("<=>-", c2)) error = GBS_global_string("Invalid uncertainty '%c'", c2);
404  else {
405  if (c == '+' || c2 == '-') {
406  if (c == '+' && c2 == '-') {
407  if (pos->start_pos[p] != pos->stop_pos[p]-1) {
408  error = GBS_global_string("Invalid positions %zu^%zu for uncertainties +-", pos->start_pos[p], pos->stop_pos[p]);
409  }
410  }
411  else {
412  error = "uncertainties '+' and '-' can only be used together";
413  }
414  }
415  }
416  }
417  }
418  }
419  }
420  }
421 
422  if (!error) {
423  if (pos->parts == 1) {
424  if (gb_pos_joined) error = GB_delete(gb_pos_joined);
425 
426  if (!error) error = GB_write_string(gb_pos_start, GBS_global_string("%zu", pos->start_pos[0]));
427  if (!error) error = GB_write_string(gb_pos_stop, GBS_global_string("%zu", pos->stop_pos[0]));
428  if (!error) error = GB_write_string(gb_pos_complement, GBS_global_string("%c", pos->complement[0]+'0'));
429 
430  if (!error && gb_pos_certain) {
431  error = GB_write_string(gb_pos_certain, GBS_global_string("%c%c", pos->start_uncertain[0], pos->stop_uncertain[0]));
432  }
433  }
434  else {
435  if (!gb_pos_joined) {
436  gb_pos_joined = GB_search(gb_gene, "pos_joined", GB_INT);
437  if (!gb_pos_joined) error = GB_await_error();
438  }
439  if (!error) error = GB_write_int(gb_pos_joined, pos->parts * (pos->joinable ? 1 : -1)); // neg. parts means not joinable
440 
441  if (!error) {
442  GBS_strstruct start(12*pos->parts);
443  GBS_strstruct stop(12*pos->parts);
444  GBS_strstruct complement(2*pos->parts);
445  GBS_strstruct uncertain(3*pos->parts);
446 
447  for (p = 0; p<pos->parts; ++p) {
448  if (p>0) {
449  start.put(',');
450  stop.put(',');
451  complement.put(',');
452  uncertain.put(',');
453  }
454  start.cat(GBS_global_string("%zu", pos->start_pos[p])); // @@@ use nprintf (or add putsize_t)
455  stop.cat(GBS_global_string("%zu", pos->stop_pos[p])); // @@@ use nprintf (or add putsize_t)
456  complement.put(pos->complement[p]+'0');
457  if (gb_pos_certain) {
458  uncertain.put(pos->start_uncertain[p]);
459  uncertain.put(pos->stop_uncertain[p]);
460  }
461  }
462 
463  error = GB_write_string(gb_pos_start, start.get_data());
464  if (!error) error = GB_write_string(gb_pos_stop, stop.get_data());
465  if (!error) error = GB_write_string(gb_pos_complement, complement.get_data());
466  if (!error && gb_pos_certain) error = GB_write_string(gb_pos_certain, uncertain.get_data());
467  }
468  }
469  }
470 
471  return error;
472 }
473 
475 
476 static int cmp_location_parts(const void *v1, const void *v2) {
477  int i1 = *(int*)v1;
478  int i2 = *(int*)v2;
479 
480  int cmp = location2sort->start_pos[i1]-location2sort->start_pos[i2];
481  if (!cmp) {
482  cmp = location2sort->stop_pos[i1]-location2sort->stop_pos[i2];
483  }
484  return cmp;
485 }
486 
488  // Note: makes location partly invalid (only start_pos + stop_pos are valid afterwards)
489  int parts = location->parts;
490  int *idx = ARB_alloc<int>(parts); // idx[newpos] = oldpos
491  int i, p;
492 
493  for (p = 0; p<parts; ++p) idx[p] = p; // IRRELEVANT_LOOP
494 
495  location2sort = location;
496  qsort(idx, parts, sizeof(*idx), cmp_location_parts);
497  location2sort = NULp;
498 
499  for (p = 0; p<parts; ++p) {
500  i = idx[p];
501 
502 #define swap(a, b, type) do { type tmp = (a); (a) = (b); (b) = (tmp); } while (0)
503 
504  if (i != p) {
505  swap(location->start_pos[i], location->start_pos[p], size_t);
506  swap(location->stop_pos[i], location->stop_pos[p], size_t);
507  swap(idx[i], idx[p], int);
508  }
509  }
510 
511 #if defined(DEBUG) && 0
512  printf("Locations sorted:\n");
513  for (p = 0; p<parts; ++p) {
514  printf(" [%i] %i - %i %i\n", p, location->start_pos[p], location->stop_pos[p], (int)(location->complement[p]));
515  }
516 #endif // DEBUG
517 
518  i = 0;
519  for (p = 1; p<parts; p++) {
520  if ((location->stop_pos[i]+1) >= location->start_pos[p]) {
521  // parts overlap or are directly consecutive
522 
523  location->stop_pos[i] = location->stop_pos[p];
524  }
525  else {
526  i++;
527  location->start_pos[i] = location->start_pos[p];
528  location->stop_pos[i] = location->stop_pos[p];
529  }
530  }
531  location->parts = i+1;
532 
533 #if defined(DEBUG) && 0
534  parts = location->parts;
535  printf("Locations merged:\n");
536  for (p = 0; p<parts; ++p) {
537  printf(" [%i] %i - %i %i\n", p, location->start_pos[p], location->stop_pos[p], (int)(location->complement[p]));
538  }
539 #endif // DEBUG
540 
541  free(idx);
542 }
543 
544 
545 
546 // -----------------------------------------
547 // test if species is pseudo-species
548 
549 const char *GEN_origin_organism(GBDATA *gb_pseudo) {
550  GBDATA *gb_origin = GB_entry(gb_pseudo, "ARB_origin_species");
551  return gb_origin ? GB_read_char_pntr(gb_origin) : NULp;
552 }
553 const char *GEN_origin_gene(GBDATA *gb_pseudo) {
554  GBDATA *gb_origin = GB_entry(gb_pseudo, "ARB_origin_gene");
555  return gb_origin ? GB_read_char_pntr(gb_origin) : NULp;
556 }
557 
559  return GEN_origin_organism(gb_species);
560 }
561 
562 // ------------------------------------------------
563 // find organism or gene for pseudo-species
564 
568 
569  return GB_export_errorf("The gene-species '%s' refers to an unknown organism (%s)\n"
570  "This occurs if you rename or delete the organism or change the entry\n"
571  "'ARB_origin_species' and will most likely cause serious problems.",
572  GBT_get_name_or_description(gb_pseudo),
573  GEN_origin_organism(gb_pseudo));
574 }
575 
576 // @@@ FIXME: missing: GEN_gene_not_found (like GEN_organism_not_found)
577 
578 // ------------------------------
579 // search pseudo species
580 
581 static const char *pseudo_species_hash_key(const char *organism_name, const char *gene_name) {
582  return GBS_global_string("%s*%s", organism_name, gene_name);
583 }
584 
585 static GBDATA *GEN_read_pseudo_species_from_hash(const GB_HASH *pseudo_hash, const char *organism_name, const char *gene_name) {
586  return (GBDATA*)GBS_read_hash(pseudo_hash, pseudo_species_hash_key(organism_name, gene_name));
587 }
588 
589 void GEN_add_pseudo_species_to_hash(GBDATA *gb_pseudo, GB_HASH *pseudo_hash) {
590  const char *organism_name = GEN_origin_organism(gb_pseudo);
591  const char *gene_name = GEN_origin_gene(gb_pseudo);
592 
593  gb_assert(organism_name);
594  gb_assert(gene_name);
595 
596  GBS_write_hash(pseudo_hash, pseudo_species_hash_key(organism_name, gene_name), (long)gb_pseudo);
597 }
598 
600  GB_HASH *pseudo_hash = GBS_create_hash(GBT_get_species_count(gb_main)+additionalSize, GB_IGNORE_CASE);
601  GBDATA *gb_pseudo;
602 
603  for (gb_pseudo = GEN_first_pseudo_species(gb_main);
604  gb_pseudo;
605  gb_pseudo = GEN_next_pseudo_species(gb_pseudo))
606  {
607  GEN_add_pseudo_species_to_hash(gb_pseudo, pseudo_hash);
608  }
609 
610  return pseudo_hash;
611 }
612 
613 GBDATA *GEN_find_pseudo_species(GBDATA *gb_main, const char *organism_name, const char *gene_name, const GB_HASH *pseudo_hash) {
614  // parameter pseudo_hash :
615  // 0 -> use slow direct search [if you only search one]
616  // otherwise it shall be a hash generated by GEN_create_pseudo_species_hash() [if you search several times]
617  // Note : use GEN_add_pseudo_species_to_hash to keep hash up-to-date
618  GBDATA *gb_pseudo;
619 
620  if (pseudo_hash) {
621  gb_pseudo = GEN_read_pseudo_species_from_hash(pseudo_hash, organism_name, gene_name);
622  }
623  else {
624  for (gb_pseudo = GEN_first_pseudo_species(gb_main);
625  gb_pseudo;
626  gb_pseudo = GEN_next_pseudo_species(gb_pseudo))
627  {
628  const char *origin_gene_name = GEN_origin_gene(gb_pseudo);
629  if (strcmp(gene_name, origin_gene_name) == 0) {
630  const char *origin_species_name = GEN_origin_organism(gb_pseudo);
631  if (strcmp(organism_name, origin_species_name) == 0) {
632  break; // found pseudo species
633  }
634  }
635  }
636  }
637  return gb_pseudo;
638 }
639 
640 // -----------------------
641 // search origins
642 
643 GBDATA *GEN_find_origin_organism(GBDATA *gb_pseudo, const GB_HASH *organism_hash) {
644  // parameter organism_hash:
645  // 0 -> use slow direct search [if you only search one or two]
646  // otherwise it shall be a hash generated by GBT_create_organism_hash() [if you search several times]
647  // Note : use GBT_add_item_to_hash() to keep hash up-to-date
648 
649  const char *origin_species_name;
650  GBDATA *gb_organism = NULp;
652 
653  origin_species_name = GEN_origin_organism(gb_pseudo);
654  if (origin_species_name) {
655  if (organism_hash) {
656  gb_organism = (GBDATA*)GBS_read_hash(organism_hash, origin_species_name);
657  }
658  else {
659  gb_organism = GBT_find_species_rel_species_data(GB_get_father(gb_pseudo), origin_species_name);
660  }
661  }
662 
663  return gb_organism;
664 }
665 
666 GBDATA *GEN_find_origin_gene(GBDATA *gb_pseudo, const GB_HASH *organism_hash) {
667  const char *origin_gene_name;
668 
670 
671  origin_gene_name = GEN_origin_gene(gb_pseudo);
672  if (origin_gene_name) {
673  GBDATA *gb_organism = GEN_find_origin_organism(gb_pseudo, organism_hash);
674  gb_assert(gb_organism);
675 
676  return GEN_find_gene(gb_organism, origin_gene_name);
677  }
678  return NULp;
679 }
680 
681 // --------------------------------
682 // find pseudo-species
683 
685  GBDATA *gb_species = GBT_first_species(gb_main);
686 
687  if (!gb_species || GEN_is_pseudo_gene_species(gb_species)) return gb_species;
688  return GEN_next_pseudo_species(gb_species);
689 }
690 
692  if (gb_species) {
693  while (1) {
694  gb_species = GBT_next_species(gb_species);
695  if (!gb_species || GEN_is_pseudo_gene_species(gb_species)) break;
696  }
697  }
698  return gb_species;
699 }
700 
702  if (gb_species) {
703  while (1) {
704  gb_species = GBT_next_marked_species(gb_species);
705  if (!gb_species || GEN_is_pseudo_gene_species(gb_species)) break;
706  }
707  }
708  return gb_species;
709 }
710 
712  GBDATA *gb_species = GBT_first_marked_species(gb_main);
713 
714  if (!gb_species || GEN_is_pseudo_gene_species(gb_species)) return gb_species;
715  return GEN_next_marked_pseudo_species(gb_species);
716 }
717 
718 // ------------------
719 // organisms
720 
721 bool GEN_is_organism(GBDATA *gb_species) {
722  gb_assert(GEN_is_genome_db(GB_get_root(gb_species), -1)); // assert this is a genome db
723  // otherwise it is an error to use GEN_is_organism (or its callers)!!!!
724 
725  return GB_entry(gb_species, GENOM_ALIGNMENT);
726 }
727 
728 GBDATA *GEN_find_organism(GBDATA *gb_main, const char *name) {
729  GBDATA *gb_orga = GBT_find_species(gb_main, name);
730  if (gb_orga) {
731  if (!GEN_is_organism(gb_orga)) {
732  fprintf(stderr, "ARBDB-warning: found unspecific species named '%s', but expected an 'organism' with that name\n", name);
733  gb_orga = NULp;
734  }
735  }
736  return gb_orga;
737 }
738 
740  GBDATA *gb_organism = GBT_first_species(gb_main);
741 
742  if (!gb_organism || GEN_is_organism(gb_organism)) return gb_organism;
743  return GEN_next_organism(gb_organism);
744 }
746  if (gb_organism) {
747  while (1) {
748  gb_organism = GBT_next_species(gb_organism);
749  if (!gb_organism || GEN_is_organism(gb_organism)) break;
750  }
751  }
752  return gb_organism;
753 
754 }
755 
757  long count = 0;
758  GBDATA *gb_organism = GEN_first_organism(gb_main);
759  while (gb_organism) {
760  count++;
761  gb_organism = GEN_next_organism(gb_organism);
762  }
763  return count;
764 }
765 
766 
768  GBDATA *gb_organism = GBT_first_marked_species(gb_main);
769 
770  if (!gb_organism || GEN_is_organism(gb_organism)) return gb_organism;
771  return GEN_next_marked_organism(gb_organism);
772 }
774  if (gb_organism) {
775  while (1) {
776  gb_organism = GBT_next_marked_species(gb_organism);
777  if (!gb_organism || GEN_is_organism(gb_organism)) break;
778  }
779  }
780  return gb_organism;
781 }
782 
783 char *GEN_global_gene_identifier(GBDATA *gb_gene, GBDATA *gb_organism) {
784  if (!gb_organism) {
785  gb_organism = GB_get_grandfather(gb_gene);
786  gb_assert(gb_organism);
787  }
788 
790 }
791 
792 // --------------------------------------------------------------------------------
793 
794 #ifdef UNIT_TESTS
795 #include <test_unit.h>
796 #include <arb_unit_test.h>
797 #include <arb_defs.h>
798 
799 static struct arb_unit_test::test_alignment_data TestAlignmentData_Genome[] = {
800  { 0, "spec", "AUCUCCUAAACCCAACCGUAGUUCGAAUUGAG" },
801 };
802 
803 #define TEST_EXPECT_MEMBER_EQUAL(s1,s2,member) TEST_EXPECT_EQUAL((s1)->member, (s2)->member)
804 
805 #define TEST_EXPECT_GENPOS_EQUAL(p1,p2) do { \
806  TEST_EXPECT_MEMBER_EQUAL(p1, p2, parts); \
807  TEST_EXPECT_MEMBER_EQUAL(p1, p2, joinable); \
808  for (int p = 0; p<(p1)->parts; ++p) { \
809  TEST_EXPECT_MEMBER_EQUAL(p1, p2, start_pos[p]); \
810  TEST_EXPECT_MEMBER_EQUAL(p1, p2, stop_pos[p]); \
811  TEST_EXPECT_MEMBER_EQUAL(p1, p2, complement[p]); \
812  if ((p1)->start_uncertain) { \
813  TEST_EXPECT_MEMBER_EQUAL(p1, p2, start_uncertain[p]); \
814  TEST_EXPECT_MEMBER_EQUAL(p1, p2, stop_uncertain[p]); \
815  } \
816  } \
817  } while(0)
818 
819 #define TEST_WRITE_READ_GEN_POSITION(pos) \
820  do { \
821  error = GEN_write_position(gb_gene, (pos), 0); \
822  if (!error) { \
823  GEN_position *rpos = GEN_read_position(gb_gene); \
824  if (!rpos) { \
825  error = GB_await_error(); \
826  } \
827  else { \
828  TEST_EXPECT_GENPOS_EQUAL((pos), rpos); \
829  GEN_free_position(rpos); \
830  } \
831  } \
832  TEST_EXPECT_NULL(error.deliver()); \
833  } while(0)
834 
835 #define TEST_WRITE_GEN_POSITION_ERROR(pos,exp_error) do { \
836  error = GEN_write_position(gb_gene, &*(pos), 0); \
837  TEST_EXPECT_EQUAL(error.deliver(), exp_error); \
838  } while(0)
839 
840 #define TEST_GENPOS_FIELD(field,value) do { \
841  GBDATA *gb_field = GB_entry(gb_gene, (field)); \
842  if ((value)) { \
843  TEST_REJECT_NULL(gb_field); \
844  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_field), (value)); \
845  } \
846  else { \
847  TEST_EXPECT_NULL(gb_field); \
848  } \
849  } while(0)
850 
851 #define TEST_GENPOS_FIELDS(start,stop,complement,certain) do { \
852  TEST_GENPOS_FIELD("pos_start", start); \
853  TEST_GENPOS_FIELD("pos_stop", stop); \
854  TEST_GENPOS_FIELD("pos_complement", complement); \
855  TEST_GENPOS_FIELD("pos_certain", certain); \
856  } while(0)
857 
858 #define TEST_GENE_SEQ_AND_LENGTH(werr,wseq,wlen) do { \
859  size_t len; \
860  char *seq = GBT_read_gene_sequence_and_length(gb_gene, true, '-', &len); \
861  TEST_EXPECT_EQUAL(GB_have_error(), werr); \
862  if (seq) { \
863  TEST_EXPECT_EQUAL(len, (size_t)(wlen)); \
864  TEST_EXPECT_EQUAL(seq, (wseq)); \
865  free(seq); \
866  } \
867  else { \
868  GB_clear_error(); \
869  } \
870  } while(0)
871 
872 void TEST_GEN_position() {
873  // see also ../GENOM_IMPORT/Location.cxx@TEST_gene_location
874 
875  GB_shell shell;
877  GBDATA *gb_main = TEST_CREATE_DB(error, "ali_genom", TestAlignmentData_Genome, false);
878 
879  TEST_EXPECT_NULL(error.deliver());
880 
881  {
882  GB_transaction ta(gb_main);
883 
884  GBDATA *gb_organism = GBT_find_species(gb_main, "spec"); TEST_REJECT_NULL(gb_organism);
885  GBDATA *gb_gene_data = GEN_findOrCreate_gene_data(gb_organism); TEST_REJECT_NULL(gb_gene_data);
886  GBDATA *gb_gene = GEN_create_nonexisting_gene_rel_gene_data(gb_gene_data, "gene"); TEST_REJECT_NULL(gb_gene);
887 
888  typedef SmartCustomPtr(GEN_position, GEN_free_position) GEN_position_Ptr;
889  GEN_position_Ptr pos;
890 
891  pos = GEN_new_position(1, false);
892 
893  TEST_WRITE_GEN_POSITION_ERROR(pos, "Illegal start position 0");
894 
895  pos->start_pos[0] = 5;
896  pos->stop_pos[0] = 10;
897  pos->complement[0] = 1;
898 
899  GEN_use_uncertainties(&*pos);
900 
901  TEST_WRITE_READ_GEN_POSITION(&*pos);
902  TEST_GENPOS_FIELDS("5", "10", "1", "==");
903 
904  TEST_GENE_SEQ_AND_LENGTH(false, "TTTAGG", 6);
905 
906  // ----------
907 
908  pos = GEN_new_position(3, false);
909 
910  TEST_WRITE_GEN_POSITION_ERROR(pos, "Illegal start position 0");
911 
912  GEN_use_uncertainties(&*pos);
913 
914  pos->start_pos[0] = 5; pos->start_pos[1] = 10; pos->start_pos[2] = 25;
915  pos->stop_pos[0] = 15; pos->stop_pos[1] = 20; pos->stop_pos[2] = 25;
916  pos->complement[0] = 0; pos->complement[1] = 1; pos->complement[2] = 0;
917 
918  pos->start_uncertain[0] = '<';
919  pos->stop_uncertain[2] = '>';
920 
921  TEST_WRITE_READ_GEN_POSITION(&*pos);
922  TEST_GENPOS_FIELDS("5,10,25", "15,20,25", "0,1,0", "<=,==,=>");
923 
924  TEST_GENE_SEQ_AND_LENGTH(false, "CCUAAACCCAA-TACGGTTGGGT-G", 25);
925 
926  pos->stop_uncertain[2] = 'x';
927  TEST_WRITE_GEN_POSITION_ERROR(pos, "Invalid uncertainty 'x'");
928 
929  pos->stop_uncertain[2] = '+';
930  TEST_WRITE_GEN_POSITION_ERROR(pos, "Invalid uncertainty '+'"); // invalid for stop
931 
932  pos->start_uncertain[2] = '+';
933  pos->stop_uncertain[2] = '-';
934  TEST_WRITE_GEN_POSITION_ERROR(pos, "Invalid positions 25^25 for uncertainties +-");
935 
936  pos->stop_pos[2] = 26;
937  TEST_WRITE_GEN_POSITION_ERROR(pos, (void*)NULp);
938 
939  pos->stop_pos[0] = 100;
940  TEST_WRITE_GEN_POSITION_ERROR(pos, "Illegal stop position 100 (>length(=32))");
941  }
942 
943  GB_close(gb_main);
944 }
945 
946 #endif // UNIT_TESTS
947 
GBDATA * GEN_create_nonexisting_gene(GBDATA *gb_species, const char *name)
Definition: adGene.cxx:96
const char * GB_ERROR
Definition: arb_core.h:25
GBDATA * GEN_next_pseudo_species(GBDATA *gb_species)
Definition: adGene.cxx:691
static GBDATA * GEN_read_pseudo_species_from_hash(const GB_HASH *pseudo_hash, const char *organism_name, const char *gene_name)
Definition: adGene.cxx:585
GBDATA * GEN_first_gene(GBDATA *gb_species)
Definition: adGene.cxx:130
unsigned char * complement
Definition: adGene.h:41
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
size_t size() const
Definition: arb_strarray.h:85
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:729
static GB_ERROR parsePositions(GBDATA *gb_gene, const char *field_name, int parts_expected, size_t *results, ConstStrArray &parseTable)
Definition: adGene.cxx:232
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:454
GB_ERROR GEN_write_position(GBDATA *gb_gene, const GEN_position *pos, long seqLength)
Definition: adGene.cxx:325
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1387
GBDATA * GEN_find_origin_organism(GBDATA *gb_pseudo, const GB_HASH *organism_hash)
Definition: adGene.cxx:643
static GEN_position * lastFreedPosition
Definition: adGene.cxx:153
const char * GEN_origin_organism(GBDATA *gb_pseudo)
Definition: adGene.cxx:549
GBDATA * GBT_expect_item_rel_item_data(GBDATA *gb_item_data, const char *id_field, const char *id_value)
Definition: aditem.cxx:91
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2561
bool GEN_is_organism(GBDATA *gb_species)
Definition: adGene.cxx:721
GBDATA * GEN_expect_gene(GBDATA *gb_species, const char *name)
Definition: adGene.cxx:72
int parts
Definition: adGene.h:37
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
void GBK_terminatef(const char *templat,...)
Definition: arb_msg.cxx:523
#define swap(a, b, type)
void cat(const char *from)
Definition: arb_strbuf.h:183
char * GEN_global_gene_identifier(GBDATA *gb_gene, GBDATA *gb_organism)
Definition: adGene.cxx:783
GEN_position * GEN_new_position(int parts, bool joinable)
Definition: adGene.cxx:155
GBDATA * GEN_find_origin_gene(GBDATA *gb_pseudo, const GB_HASH *organism_hash)
Definition: adGene.cxx:666
GBDATA * GEN_next_marked_organism(GBDATA *gb_organism)
Definition: adGene.cxx:773
GBDATA * GB_get_grandfather(GBDATA *gbd)
Definition: arbdb.cxx:1728
GBDATA * GB_get_father(GBDATA *gbd)
Definition: arbdb.cxx:1722
GB_ERROR GB_push_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2494
static struct GEN_position_mem_handler GEN_position_dealloc
GBDATA * GEN_first_marked_pseudo_species(GBDATA *gb_main)
Definition: adGene.cxx:711
static GB_ERROR parseCSV(GBDATA *gb_gene, const char *field_name, size_t parts_expected, ConstStrArray &parseTable)
Definition: adGene.cxx:211
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
static HelixNrInfo * start
static const char * pseudo_species_hash_key(const char *organism_name, const char *gene_name)
Definition: adGene.cxx:581
#define SmartCustomPtr(type, deallocator)
Definition: smartptr.h:46
GB_ERROR GB_export_error(const char *error)
Definition: arb_msg.cxx:257
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
GBDATA * GB_create_container(GBDATA *father, const char *key)
Definition: arbdb.cxx:1829
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:758
GB_ERROR deliver() const
Definition: arb_error.h:116
long GEN_get_organism_count(GBDATA *gb_main)
Definition: adGene.cxx:756
void GEN_add_pseudo_species_to_hash(GBDATA *gb_pseudo, GB_HASH *pseudo_hash)
Definition: adGene.cxx:589
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1781
GBDATA * GEN_next_marked_gene(GBDATA *gb_gene)
Definition: adGene.cxx:146
unsigned char * start_uncertain
Definition: adGene.h:53
static int cmp_location_parts(const void *v1, const void *v2)
Definition: adGene.cxx:476
static GEN_position * location2sort
Definition: adGene.cxx:474
GBDATA * GEN_first_gene_rel_gene_data(GBDATA *gb_gene_data)
Definition: adGene.cxx:134
const char * GEN_origin_gene(GBDATA *gb_pseudo)
Definition: adGene.cxx:553
#define false
Definition: ureadseq.h:13
GB_HASH * GEN_create_pseudo_species_hash(GBDATA *gb_main, long additionalSize)
Definition: adGene.cxx:599
GBDATA * GEN_find_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name)
Definition: adGene.cxx:60
size_t * stop_pos
Definition: adGene.h:40
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1325
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * GB_get_root(GBDATA *gbd)
Definition: arbdb.cxx:1740
bool GEN_is_pseudo_gene_species(GBDATA *gb_species)
Definition: adGene.cxx:558
GB_ERROR GEN_organism_not_found(GBDATA *gb_pseudo)
Definition: adGene.cxx:565
void GBT_splitNdestroy_string(ConstStrArray &names, char *&namelist, const char *separator, bool dropEmptyTokens)
bool GEN_is_genome_db(GBDATA *gb_main, int default_value)
Definition: adGene.cxx:20
GBDATA * GEN_first_organism(GBDATA *gb_main)
Definition: adGene.cxx:739
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
GEN_position * GEN_read_position(GBDATA *gb_gene)
Definition: adGene.cxx:250
GBDATA * GEN_next_gene(GBDATA *gb_gene)
Definition: adGene.cxx:138
void GEN_use_uncertainties(GEN_position *pos)
Definition: adGene.cxx:184
#define cmp(h1, h2)
Definition: admap.cxx:50
GBDATA * GBT_find_species_rel_species_data(GBDATA *gb_species_data, const char *name)
Definition: aditem.cxx:133
void GEN_free_position(GEN_position *pos)
Definition: adGene.cxx:195
GBDATA * GEN_first_marked_organism(GBDATA *gb_main)
Definition: adGene.cxx:767
bool GB_has_key(GBDATA *gbd, const char *key)
Definition: arbdb.cxx:1707
Definition: arbdb.h:86
#define GENOM_ALIGNMENT
Definition: adGene.h:19
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:708
GB_ERROR GB_write_int(GBDATA *gbd, long i)
Definition: arbdb.cxx:1250
GBDATA * GEN_find_organism(GBDATA *gb_main, const char *name)
Definition: adGene.cxx:728
#define GENOM_DB_TYPE
Definition: adGene.h:20
void GEN_sortAndMergeLocationParts(GEN_position *location)
Definition: adGene.cxx:487
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:262
static GBDATA * GEN_create_nonexisting_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name)
Definition: adGene.cxx:78
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
GBDATA * GEN_findOrCreate_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:44
#define TEST_EXPECT_NULL(n)
Definition: test_unit.h:1322
#define gb_assert(cond)
Definition: arbdbt.h:11
static GBDATA * GEN_next_marked_pseudo_species(GBDATA *gb_species)
Definition: adGene.cxx:701
GBDATA * GEN_find_gene(GBDATA *gb_species, const char *name)
Definition: adGene.cxx:64
GBDATA * GB_next_marked(GBDATA *gbd, const char *keystring)
Definition: adquery.cxx:670
#define assert_or_exit(cond)
Definition: arb_assert.h:263
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
GBDATA * GEN_find_pseudo_species(GBDATA *gb_main, const char *organism_name, const char *gene_name, const GB_HASH *pseudo_hash)
Definition: adGene.cxx:613
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:909
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
GBDATA * GEN_find_or_create_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name)
Definition: adGene.cxx:100
size_t * start_pos
Definition: adGene.h:39
GBDATA * GB_find_string(GBDATA *gbd, const char *key, const char *str, GB_CASE case_sens, GB_SEARCH_TYPE gbs)
Definition: adquery.cxx:302
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:114
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
void reserve(size_t forElems)
Definition: arb_strarray.h:82
const char * get_data() const
Definition: arb_strbuf.h:120
GBDATA * GEN_first_marked_gene(GBDATA *gb_species)
Definition: adGene.cxx:143
GBDATA * GEN_find_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:50
GBDATA * GB_first_marked(GBDATA *gbd, const char *keystring)
Definition: adquery.cxx:655
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GBDATA * GBT_find_item_rel_item_data(GBDATA *gb_item_data, const char *id_field, const char *id_value)
Definition: aditem.cxx:74
GBDATA * GEN_first_pseudo_species(GBDATA *gb_main)
Definition: adGene.cxx:684
GB_transaction ta(gb_var)
bool joinable
Definition: adGene.h:38
Definition: trnsprob.h:20
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:32
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:459
size_t length
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:392
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
GBDATA * GEN_next_organism(GBDATA *gb_organism)
Definition: adGene.cxx:745
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:655
GBDATA * GEN_expect_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:54
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
void put(char c)
Definition: arb_strbuf.h:158
unsigned char * stop_uncertain
Definition: adGene.h:54
Definition: arbdb.h:66