ARB
adGene.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : adGene.cxx //
4 // Purpose : Basic gene access functions //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in July 2002 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "gb_local.h"
13 
14 #include <adGene.h>
15 #include <arbdbt.h>
16 #include <arb_strbuf.h>
17 #include <arb_strarray.h>
18 
19 
20 bool GEN_is_genome_db(GBDATA *gb_main, int default_value) {
21  // default_value == 0 -> default to normal database
22  // == 1 -> default to GENOM database
23  // == -1 -> assume that type is already defined (is defined for all DBs ever loaded in ARB-gui)
24  GBDATA *gb_genom_db = GB_entry(gb_main, GENOM_DB_TYPE);
25 
26  if (!gb_genom_db) { // no DB-type entry -> create one with default
28 
29  assert_or_exit(default_value != -1); // first call to GEN_is_genome_db has to provide a 'default_value'
30 
31  gb_genom_db = GB_create(gb_main, GENOM_DB_TYPE, GB_INT);
32  if (!gb_genom_db) error = GB_await_error();
33  else error = GB_write_int(gb_genom_db, default_value);
34 
35  if (error) GBK_terminatef("Fatal in GEN_is_genome_db: %s", error);
36  }
37 
38  return GB_read_int(gb_genom_db) != 0;
39 }
40 
41 // --------------
42 // genes:
43 
45  GBDATA *gb_gene_data = GB_search(gb_species, "gene_data", GB_CREATE_CONTAINER);
46  gb_assert(gb_gene_data);
47  return gb_gene_data;
48 }
49 
51  return GB_search(gb_species, "gene_data", GB_FIND);
52 }
53 
55  GBDATA *gb_gene_data = GB_search(gb_species, "gene_data", GB_FIND);
56  gb_assert(gb_gene_data);
57  return gb_gene_data;
58 }
59 
60 GBDATA* GEN_find_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name) {
61  return GBT_find_item_rel_item_data(gb_gene_data, "name", name);
62 }
63 
64 GBDATA* GEN_find_gene(GBDATA *gb_species, const char *name) {
65  // Search for a gene.
66  // Return found gene or NULp (in this case an error MAY be exported).
67  //
68  // Note: If you expect the gene to exists, use GEN_expect_gene!
69  GBDATA *gb_gene_data = GEN_find_gene_data(gb_species);
70  return gb_gene_data ? GEN_find_gene_rel_gene_data(gb_gene_data, name) : NULp;
71 }
72 GBDATA* GEN_expect_gene(GBDATA *gb_species, const char *name) {
73  // Returns an existing gene or
74  // NULp (in that case an error is exported)
75  return GBT_expect_item_rel_item_data(GEN_expect_gene_data(gb_species), "name", name);
76 }
77 
78 static GBDATA* GEN_create_nonexisting_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name) {
79  GB_ERROR error = GB_push_transaction(gb_gene_data);
80  GBDATA *gb_gene = NULp;
81 
82  gb_assert(!GEN_find_gene_rel_gene_data(gb_gene_data, name)); // don't call this function if you are not sure that the gene does not exists!
83 
84  if (!error) {
85  gb_gene = GB_create_container(gb_gene_data, "gene");
86  error = gb_gene ? GBT_write_string(gb_gene, "name", name) : GB_await_error();
87  }
88 
89  gb_assert(gb_gene || error);
90  error = GB_end_transaction(gb_gene_data, error);
91  if (error) GB_export_error(error);
92 
93  return gb_gene;
94 }
95 
96 GBDATA* GEN_create_nonexisting_gene(GBDATA *gb_species, const char *name) { // needed by ../PERL_SCRIPTS/GENOME/GI.pm@create_nonexisting_gene
98 }
99 
100 GBDATA* GEN_find_or_create_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name) {
101  GBDATA *gb_gene = NULp;
102 
103  // Search for a gene, when gene does not exist create it
104  if (!name || !name[0]) {
105  GB_export_error("Missing gene name");
106  }
107  else {
108  GBDATA *gb_name = GB_find_string(gb_gene_data, "name", name, GB_IGNORE_CASE, SEARCH_GRANDCHILD);
109 
110  if (gb_name) {
111  gb_gene = GB_get_father(gb_name); // found existing gene
112  }
113  else {
114  GB_ERROR error = GB_push_transaction(gb_gene_data);
115 
116  if (!error) {
117  gb_gene = GB_create_container(gb_gene_data, "gene");
118  error = GBT_write_string(gb_gene, "name", name);
119  }
120  error = GB_end_transaction(gb_gene_data, error);
121  if (error) {
122  gb_gene = NULp;
123  GB_export_error(error);
124  }
125  }
126  }
127  return gb_gene;
128 }
129 
130 GBDATA* GEN_first_gene(GBDATA *gb_species) {
131  return GB_entry(GEN_expect_gene_data(gb_species), "gene");
132 }
133 
135  return GB_entry(gb_gene_data, "gene");
136 }
137 
139  gb_assert(GB_has_key(gb_gene, "gene"));
140  return GB_nextEntry(gb_gene);
141 }
142 
144  return GB_first_marked(GEN_expect_gene_data(gb_species), "gene");
145 }
147  return GB_next_marked(gb_gene, "gene");
148 }
149 
150 // ----------------------
151 // gene position
152 
154 
155 GEN_position *GEN_new_position(int parts, bool joinable) {
156  GEN_position *pos;
157 
158  size_t pos_size = parts*sizeof(pos->start_pos[0]);
159  size_t comp_size = parts*sizeof(pos->complement[0]);
160  size_t data_size = 2*pos_size+3*comp_size;
161 
162  gb_assert(parts>0);
163 
164  if (lastFreedPosition && lastFreedPosition->parts == parts) {
165  pos = lastFreedPosition;
166  lastFreedPosition = NULp;
167  memset(pos->start_pos, 0, data_size);
168  }
169  else {
170  ARB_calloc(pos, 1);
171  pos->parts = parts;
172  pos->start_pos = (size_t*)ARB_calloc<char>(data_size);
173  pos->stop_pos = pos->start_pos+parts;
174  pos->complement = (unsigned char*)(pos->stop_pos+parts);
175  }
176 
177  pos->joinable = joinable;
178  pos->start_uncertain = NULp;
179  pos->stop_uncertain = NULp;
180 
181  return pos;
182 }
183 
185  if (!pos->start_uncertain) {
186  // space was already allocated in GEN_new_position
187  pos->start_uncertain = pos->complement+pos->parts;
188  pos->stop_uncertain = pos->start_uncertain+pos->parts;
189 
190  size_t comp_size = pos->parts*sizeof(pos->complement[0]);
191  memset(pos->start_uncertain, '=', 2*comp_size);
192  }
193 }
194 
196  if (pos) {
197  if (lastFreedPosition) {
198  free(lastFreedPosition->start_pos); // rest is allocated together with start_pos
199  free(lastFreedPosition);
200  }
201 
202  lastFreedPosition = pos;
203  }
204 }
205 
209 
210 
211 static GB_ERROR parseCSV(GBDATA *gb_gene, const char *field_name, size_t parts_expected, ConstStrArray& parseTable) {
212  // reads a field and splits the content at ','
213  // results are stored in parseTable
214 
215  GB_ERROR error = NULp;
216  GBDATA *gb_field = GB_entry(gb_gene, field_name);
217  if (!gb_field) error = GBS_global_string("Expected entry '%s' missing", field_name);
218  else {
219  char *content = GB_read_string(gb_field);
220  if (!content) error = GB_await_error();
221  else {
222  parseTable.erase();
223  GBT_splitNdestroy_string(parseTable, content, ',');
224  if (parseTable.size() != parts_expected) {
225  error = GBS_global_string("Expected %zu CSV, found %zu", parts_expected, parseTable.size());
226  }
227  }
228  }
229  return error;
230 }
231 
232 static GB_ERROR parsePositions(GBDATA *gb_gene, const char *field_name, int parts_expected, size_t *results, ConstStrArray& parseTable) {
233  GB_ERROR error = parseCSV(gb_gene, field_name, parts_expected, parseTable);
234  if (!error) {
235  int p;
236  for (p = 0; p<parts_expected && !error; p++) {
237  char *end;
238  results[p] = strtol(parseTable[p], &end, 10);
239  if (end == parseTable[p]) { // error
240  error = GBS_global_string("can't convert '%s' to number", parseTable[p]);
241  }
242  }
243  }
244  if (error) {
245  error = GBS_global_string("While parsing field '%s': %s", field_name, error);
246  }
247  return error;
248 }
249 
251  int parts = 1;
252  bool joinable = false;
253  GBDATA *gb_pos_joined = GB_entry(gb_gene, "pos_joined");
254  GEN_position *pos = NULp;
255  GB_ERROR error = NULp;
256 
257  if (gb_pos_joined) {
258  parts = GB_read_int(gb_pos_joined);
259  if (parts != 1) { // split
260  if (parts>1) joinable = true;
261  else if (parts<-1) parts = -parts; // neg value means "not joinable" (comes from feature location 'order(...)')
262  else error = GBS_global_string("Illegal value %i in 'pos_joined'", parts);
263  }
264  }
265 
266  if (!error) {
267  pos = GEN_new_position(parts, joinable);
268 
269  ConstStrArray parseTable;
270  parseTable.reserve(parts);
271 
272  error = parsePositions(gb_gene, "pos_start", parts, pos->start_pos, parseTable);
273  if (!error) error = parsePositions(gb_gene, "pos_stop", parts, pos->stop_pos, parseTable);
274 
275  int p;
276  if (!error) {
277  error = parseCSV(gb_gene, "pos_complement", parts, parseTable);
278  for (p = 0; p<parts && !error; p++) {
279  const char *val = parseTable[p];
280  if ((val[0] != '0' && val[0] != '1') || val[1] != 0) {
281  error = GBS_global_string("Invalid content '%s' in 'pos_complement' (expected: \"01\")", val);
282  }
283  else {
284  pos->complement[p] = (unsigned char)atoi(val);
285  }
286  }
287  }
288 
289  if (!error) {
290  GBDATA *gb_pos_certain = GB_entry(gb_gene, "pos_certain");
291 
292  if (gb_pos_certain) {
293  error = parseCSV(gb_gene, "pos_certain", parts, parseTable);
295  for (p = 0; p<parts && !error; p++) {
296  const unsigned char *val = (unsigned char *)(parseTable[p]);
297  int vp;
298 
299  for (vp = 0; vp<2; vp++) {
300  unsigned char c = val[vp];
301  if (c != '<' && c != '=' && c != '>' && (c != "+-"[vp])) {
302  error = GBS_global_string("Invalid content '%s' in 'pos_certain' (expected 2 from \"<=>\")", val);
303  }
304  }
305  if (!error) {
306  pos->start_uncertain[p] = val[0];
307  pos->stop_uncertain[p] = val[1];
308  }
309  }
310  }
311  }
312  }
313 
314  gb_assert(error || pos);
315  if (error) {
316  GB_export_error(error);
317  if (pos) {
318  GEN_free_position(pos);
319  pos = NULp;
320  }
321  }
322  return pos;
323 }
324 
325 GB_ERROR GEN_write_position(GBDATA *gb_gene, const GEN_position *pos, long seqLength) {
326  // if 'seqLength' is != 0, it is used to check the correctness of 'pos'
327  // (otherwise this function reads the genome-sequence to detect its length)
328 
329  GB_ERROR error = NULp;
330  GBDATA *gb_pos_joined = GB_entry(gb_gene, "pos_joined");
331  GBDATA *gb_pos_certain = GB_entry(gb_gene, "pos_certain");
332  GBDATA *gb_pos_start;
333  GBDATA *gb_pos_stop;
334  GBDATA *gb_pos_complement;
335  int p;
336 
337  gb_assert(pos);
338 
339  gb_pos_start = GB_search(gb_gene, "pos_start", GB_STRING);
340  if (!gb_pos_start) error = GB_await_error();
341 
342  if (!error) {
343  gb_pos_stop = GB_search(gb_gene, "pos_stop", GB_STRING);
344  if (!gb_pos_stop) error = GB_await_error();
345  }
346  if (!error) {
347  gb_pos_complement = GB_search(gb_gene, "pos_complement", GB_STRING);
348  if (!gb_pos_complement) error = GB_await_error();
349  }
350 
351  if (!error) {
352  if (pos->start_uncertain) {
353  if (!gb_pos_certain) {
354  gb_pos_certain = GB_search(gb_gene, "pos_certain", GB_STRING);
355  if (!gb_pos_certain) error = GB_await_error();
356  }
357  }
358  else {
359  if (gb_pos_certain) {
360  error = GB_delete(gb_pos_certain);
361  gb_pos_certain = NULp;
362  }
363  }
364  }
365 
366  // test data
367  if (!error) {
368  size_t length;
369 
370  if (seqLength) {
371  length = seqLength;
372  }
373  else { // unknown -> autodetect
374  GBDATA *gb_organism = GB_get_grandfather(gb_gene);
375  GBDATA *gb_genome = GBT_find_sequence(gb_organism, GENOM_ALIGNMENT);
376 
377  length = GB_read_count(gb_genome);
378  }
379 
380  for (p = 0; p<pos->parts && !error; ++p) {
381  char c;
382 
383  c = pos->complement[p]; gb_assert(c == 0 || c == 1);
384  if (c<0 || c>1) {
385  error = GBS_global_string("Illegal value %i in complement", int(c));
386  }
387  else {
388  if (pos->start_pos[p]>pos->stop_pos[p]) {
389  error = GBS_global_string("Illegal positions (%zu>%zu)", pos->start_pos[p], pos->stop_pos[p]);
390  }
391  else if (pos->start_pos[p] == 0) {
392  error = GBS_global_string("Illegal start position %zu", pos->start_pos[p]);
393  }
394  else if (pos->stop_pos[p] > length) {
395  error = GBS_global_string("Illegal stop position %zu (>length(=%zu))", pos->stop_pos[p], length);
396  }
397  else {
398  if (pos->start_uncertain) {
399  c = pos->start_uncertain[p];
400  char c2 = pos->stop_uncertain[p];
401 
402  if (!c || !strchr("<=>+", c)) error = GBS_global_string("Invalid uncertainty '%c'", c);
403  else if (!c2 || !strchr("<=>-", c2)) error = GBS_global_string("Invalid uncertainty '%c'", c2);
404  else {
405  if (c == '+' || c2 == '-') {
406  if (c == '+' && c2 == '-') {
407  if (pos->start_pos[p] != pos->stop_pos[p]-1) {
408  error = GBS_global_string("Invalid positions %zu^%zu for uncertainties +-", pos->start_pos[p], pos->stop_pos[p]);
409  }
410  }
411  else {
412  error = "uncertainties '+' and '-' can only be used together";
413  }
414  }
415  }
416  }
417  }
418  }
419  }
420  }
421 
422  if (!error) {
423  if (pos->parts == 1) {
424  if (gb_pos_joined) error = GB_delete(gb_pos_joined);
425 
426  if (!error) error = GB_write_string(gb_pos_start, GBS_global_string("%zu", pos->start_pos[0]));
427  if (!error) error = GB_write_string(gb_pos_stop, GBS_global_string("%zu", pos->stop_pos[0]));
428  if (!error) error = GB_write_string(gb_pos_complement, GBS_global_string("%c", pos->complement[0]+'0'));
429 
430  if (!error && gb_pos_certain) {
431  error = GB_write_string(gb_pos_certain, GBS_global_string("%c%c", pos->start_uncertain[0], pos->stop_uncertain[0]));
432  }
433  }
434  else {
435  if (!gb_pos_joined) {
436  gb_pos_joined = GB_search(gb_gene, "pos_joined", GB_INT);
437  if (!gb_pos_joined) error = GB_await_error();
438  }
439  if (!error) error = GB_write_int(gb_pos_joined, pos->parts * (pos->joinable ? 1 : -1)); // neg. parts means not joinable
440 
441  if (!error) {
442  GBS_strstruct *start = GBS_stropen(12*pos->parts);
443  GBS_strstruct *stop = GBS_stropen(12*pos->parts);
444  GBS_strstruct *complement = GBS_stropen(2*pos->parts);
445  GBS_strstruct *uncertain = GBS_stropen(3*pos->parts);
446 
447  for (p = 0; p<pos->parts; ++p) {
448  if (p>0) {
449  GBS_chrcat(start, ',');
450  GBS_chrcat(stop, ',');
451  GBS_chrcat(complement, ',');
452  GBS_chrcat(uncertain, ',');
453  }
454  GBS_strcat(start, GBS_global_string("%zu", pos->start_pos[p]));
455  GBS_strcat(stop, GBS_global_string("%zu", pos->stop_pos[p]));
456  GBS_chrcat(complement, pos->complement[p]+'0');
457  if (gb_pos_certain) {
458  GBS_chrcat(uncertain, pos->start_uncertain[p]);
459  GBS_chrcat(uncertain, pos->stop_uncertain[p]);
460  }
461  }
462 
463  char *sstart = GBS_strclose(start);
464  char *sstop = GBS_strclose(stop);
465  char *scomplement = GBS_strclose(complement);
466  char *suncertain = GBS_strclose(uncertain);
467 
468  error = GB_write_string(gb_pos_start, sstart);
469  if (!error) error = GB_write_string(gb_pos_stop, sstop);
470  if (!error) error = GB_write_string(gb_pos_complement, scomplement);
471  if (!error && gb_pos_certain) error = GB_write_string(gb_pos_certain, suncertain);
472 
473  free(suncertain);
474  free(scomplement);
475  free(sstop);
476  free(sstart);
477  }
478  }
479  }
480 
481  return error;
482 }
483 
485 
486 static int cmp_location_parts(const void *v1, const void *v2) {
487  int i1 = *(int*)v1;
488  int i2 = *(int*)v2;
489 
490  int cmp = location2sort->start_pos[i1]-location2sort->start_pos[i2];
491  if (!cmp) {
492  cmp = location2sort->stop_pos[i1]-location2sort->stop_pos[i2];
493  }
494  return cmp;
495 }
496 
498  // Note: makes location partly invalid (only start_pos + stop_pos are valid afterwards)
499  int parts = location->parts;
500  int *idx = ARB_alloc<int>(parts); // idx[newpos] = oldpos
501  int i, p;
502 
503  for (p = 0; p<parts; ++p) idx[p] = p; // IRRELEVANT_LOOP
504 
505  location2sort = location;
506  qsort(idx, parts, sizeof(*idx), cmp_location_parts);
507  location2sort = NULp;
508 
509  for (p = 0; p<parts; ++p) {
510  i = idx[p];
511 
512 #define swap(a, b, type) do { type tmp = (a); (a) = (b); (b) = (tmp); } while (0)
513 
514  if (i != p) {
515  swap(location->start_pos[i], location->start_pos[p], size_t);
516  swap(location->stop_pos[i], location->stop_pos[p], size_t);
517  swap(idx[i], idx[p], int);
518  }
519  }
520 
521 #if defined(DEBUG) && 0
522  printf("Locations sorted:\n");
523  for (p = 0; p<parts; ++p) {
524  printf(" [%i] %i - %i %i\n", p, location->start_pos[p], location->stop_pos[p], (int)(location->complement[p]));
525  }
526 #endif // DEBUG
527 
528  i = 0;
529  for (p = 1; p<parts; p++) {
530  if ((location->stop_pos[i]+1) >= location->start_pos[p]) {
531  // parts overlap or are directly consecutive
532 
533  location->stop_pos[i] = location->stop_pos[p];
534  }
535  else {
536  i++;
537  location->start_pos[i] = location->start_pos[p];
538  location->stop_pos[i] = location->stop_pos[p];
539  }
540  }
541  location->parts = i+1;
542 
543 #if defined(DEBUG) && 0
544  parts = location->parts;
545  printf("Locations merged:\n");
546  for (p = 0; p<parts; ++p) {
547  printf(" [%i] %i - %i %i\n", p, location->start_pos[p], location->stop_pos[p], (int)(location->complement[p]));
548  }
549 #endif // DEBUG
550 
551  free(idx);
552 }
553 
554 
555 
556 // -----------------------------------------
557 // test if species is pseudo-species
558 
559 const char *GEN_origin_organism(GBDATA *gb_pseudo) {
560  GBDATA *gb_origin = GB_entry(gb_pseudo, "ARB_origin_species");
561  return gb_origin ? GB_read_char_pntr(gb_origin) : NULp;
562 }
563 const char *GEN_origin_gene(GBDATA *gb_pseudo) {
564  GBDATA *gb_origin = GB_entry(gb_pseudo, "ARB_origin_gene");
565  return gb_origin ? GB_read_char_pntr(gb_origin) : NULp;
566 }
567 
569  return GEN_origin_organism(gb_species);
570 }
571 
572 // ------------------------------------------------
573 // find organism or gene for pseudo-species
574 
578 
579  return GB_export_errorf("The gene-species '%s' refers to an unknown organism (%s)\n"
580  "This occurs if you rename or delete the organism or change the entry\n"
581  "'ARB_origin_species' and will most likely cause serious problems.",
582  GBT_get_name_or_description(gb_pseudo),
583  GEN_origin_organism(gb_pseudo));
584 }
585 
586 // @@@ FIXME: missing: GEN_gene_not_found (like GEN_organism_not_found)
587 
588 // ------------------------------
589 // search pseudo species
590 
591 static const char *pseudo_species_hash_key(const char *organism_name, const char *gene_name) {
592  return GBS_global_string("%s*%s", organism_name, gene_name);
593 }
594 
595 static GBDATA *GEN_read_pseudo_species_from_hash(const GB_HASH *pseudo_hash, const char *organism_name, const char *gene_name) {
596  return (GBDATA*)GBS_read_hash(pseudo_hash, pseudo_species_hash_key(organism_name, gene_name));
597 }
598 
599 void GEN_add_pseudo_species_to_hash(GBDATA *gb_pseudo, GB_HASH *pseudo_hash) {
600  const char *organism_name = GEN_origin_organism(gb_pseudo);
601  const char *gene_name = GEN_origin_gene(gb_pseudo);
602 
603  gb_assert(organism_name);
604  gb_assert(gene_name);
605 
606  GBS_write_hash(pseudo_hash, pseudo_species_hash_key(organism_name, gene_name), (long)gb_pseudo);
607 }
608 
610  GB_HASH *pseudo_hash = GBS_create_hash(GBT_get_species_count(gb_main)+additionalSize, GB_IGNORE_CASE);
611  GBDATA *gb_pseudo;
612 
613  for (gb_pseudo = GEN_first_pseudo_species(gb_main);
614  gb_pseudo;
615  gb_pseudo = GEN_next_pseudo_species(gb_pseudo))
616  {
617  GEN_add_pseudo_species_to_hash(gb_pseudo, pseudo_hash);
618  }
619 
620  return pseudo_hash;
621 }
622 
623 GBDATA *GEN_find_pseudo_species(GBDATA *gb_main, const char *organism_name, const char *gene_name, const GB_HASH *pseudo_hash) {
624  // parameter pseudo_hash :
625  // 0 -> use slow direct search [if you only search one]
626  // otherwise it shall be a hash generated by GEN_create_pseudo_species_hash() [if you search several times]
627  // Note : use GEN_add_pseudo_species_to_hash to keep hash up-to-date
628  GBDATA *gb_pseudo;
629 
630  if (pseudo_hash) {
631  gb_pseudo = GEN_read_pseudo_species_from_hash(pseudo_hash, organism_name, gene_name);
632  }
633  else {
634  for (gb_pseudo = GEN_first_pseudo_species(gb_main);
635  gb_pseudo;
636  gb_pseudo = GEN_next_pseudo_species(gb_pseudo))
637  {
638  const char *origin_gene_name = GEN_origin_gene(gb_pseudo);
639  if (strcmp(gene_name, origin_gene_name) == 0) {
640  const char *origin_species_name = GEN_origin_organism(gb_pseudo);
641  if (strcmp(organism_name, origin_species_name) == 0) {
642  break; // found pseudo species
643  }
644  }
645  }
646  }
647  return gb_pseudo;
648 }
649 
650 // -----------------------
651 // search origins
652 
653 GBDATA *GEN_find_origin_organism(GBDATA *gb_pseudo, const GB_HASH *organism_hash) {
654  // parameter organism_hash:
655  // 0 -> use slow direct search [if you only search one or two]
656  // otherwise it shall be a hash generated by GBT_create_organism_hash() [if you search several times]
657  // Note : use GBT_add_item_to_hash() to keep hash up-to-date
658 
659  const char *origin_species_name;
660  GBDATA *gb_organism = NULp;
662 
663  origin_species_name = GEN_origin_organism(gb_pseudo);
664  if (origin_species_name) {
665  if (organism_hash) {
666  gb_organism = (GBDATA*)GBS_read_hash(organism_hash, origin_species_name);
667  }
668  else {
669  gb_organism = GBT_find_species_rel_species_data(GB_get_father(gb_pseudo), origin_species_name);
670  }
671  }
672 
673  return gb_organism;
674 }
675 
676 GBDATA *GEN_find_origin_gene(GBDATA *gb_pseudo, const GB_HASH *organism_hash) {
677  const char *origin_gene_name;
678 
680 
681  origin_gene_name = GEN_origin_gene(gb_pseudo);
682  if (origin_gene_name) {
683  GBDATA *gb_organism = GEN_find_origin_organism(gb_pseudo, organism_hash);
684  gb_assert(gb_organism);
685 
686  return GEN_find_gene(gb_organism, origin_gene_name);
687  }
688  return NULp;
689 }
690 
691 // --------------------------------
692 // find pseudo-species
693 
695  GBDATA *gb_species = GBT_first_species(gb_main);
696 
697  if (!gb_species || GEN_is_pseudo_gene_species(gb_species)) return gb_species;
698  return GEN_next_pseudo_species(gb_species);
699 }
700 
702  if (gb_species) {
703  while (1) {
704  gb_species = GBT_next_species(gb_species);
705  if (!gb_species || GEN_is_pseudo_gene_species(gb_species)) break;
706  }
707  }
708  return gb_species;
709 }
710 
712  if (gb_species) {
713  while (1) {
714  gb_species = GBT_next_marked_species(gb_species);
715  if (!gb_species || GEN_is_pseudo_gene_species(gb_species)) break;
716  }
717  }
718  return gb_species;
719 }
720 
722  GBDATA *gb_species = GBT_first_marked_species(gb_main);
723 
724  if (!gb_species || GEN_is_pseudo_gene_species(gb_species)) return gb_species;
725  return GEN_next_marked_pseudo_species(gb_species);
726 }
727 
728 // ------------------
729 // organisms
730 
731 bool GEN_is_organism(GBDATA *gb_species) {
732  gb_assert(GEN_is_genome_db(GB_get_root(gb_species), -1)); // assert this is a genome db
733  // otherwise it is an error to use GEN_is_organism (or its callers)!!!!
734 
735  return GB_entry(gb_species, GENOM_ALIGNMENT);
736 }
737 
738 GBDATA *GEN_find_organism(GBDATA *gb_main, const char *name) {
739  GBDATA *gb_orga = GBT_find_species(gb_main, name);
740  if (gb_orga) {
741  if (!GEN_is_organism(gb_orga)) {
742  fprintf(stderr, "ARBDB-warning: found unspecific species named '%s', but expected an 'organism' with that name\n", name);
743  gb_orga = NULp;
744  }
745  }
746  return gb_orga;
747 }
748 
750  GBDATA *gb_organism = GBT_first_species(gb_main);
751 
752  if (!gb_organism || GEN_is_organism(gb_organism)) return gb_organism;
753  return GEN_next_organism(gb_organism);
754 }
756  if (gb_organism) {
757  while (1) {
758  gb_organism = GBT_next_species(gb_organism);
759  if (!gb_organism || GEN_is_organism(gb_organism)) break;
760  }
761  }
762  return gb_organism;
763 
764 }
765 
767  long count = 0;
768  GBDATA *gb_organism = GEN_first_organism(gb_main);
769  while (gb_organism) {
770  count++;
771  gb_organism = GEN_next_organism(gb_organism);
772  }
773  return count;
774 }
775 
776 
778  GBDATA *gb_organism = GBT_first_marked_species(gb_main);
779 
780  if (!gb_organism || GEN_is_organism(gb_organism)) return gb_organism;
781  return GEN_next_marked_organism(gb_organism);
782 }
784  if (gb_organism) {
785  while (1) {
786  gb_organism = GBT_next_marked_species(gb_organism);
787  if (!gb_organism || GEN_is_organism(gb_organism)) break;
788  }
789  }
790  return gb_organism;
791 }
792 
793 char *GEN_global_gene_identifier(GBDATA *gb_gene, GBDATA *gb_organism) {
794  if (!gb_organism) {
795  gb_organism = GB_get_grandfather(gb_gene);
796  gb_assert(gb_organism);
797  }
798 
800 }
801 
802 // --------------------------------------------------------------------------------
803 
804 #ifdef UNIT_TESTS
805 #include <test_unit.h>
806 #include <arb_unit_test.h>
807 #include <arb_defs.h>
808 
809 static struct arb_unit_test::test_alignment_data TestAlignmentData_Genome[] = {
810  { 0, "spec", "AUCUCCUAAACCCAACCGUAGUUCGAAUUGAG" },
811 };
812 
813 #define TEST_EXPECT_MEMBER_EQUAL(s1,s2,member) TEST_EXPECT_EQUAL((s1)->member, (s2)->member)
814 
815 #define TEST_EXPECT_GENPOS_EQUAL(p1,p2) do { \
816  TEST_EXPECT_MEMBER_EQUAL(p1, p2, parts); \
817  TEST_EXPECT_MEMBER_EQUAL(p1, p2, joinable); \
818  for (int p = 0; p<(p1)->parts; ++p) { \
819  TEST_EXPECT_MEMBER_EQUAL(p1, p2, start_pos[p]); \
820  TEST_EXPECT_MEMBER_EQUAL(p1, p2, stop_pos[p]); \
821  TEST_EXPECT_MEMBER_EQUAL(p1, p2, complement[p]); \
822  if ((p1)->start_uncertain) { \
823  TEST_EXPECT_MEMBER_EQUAL(p1, p2, start_uncertain[p]); \
824  TEST_EXPECT_MEMBER_EQUAL(p1, p2, stop_uncertain[p]); \
825  } \
826  } \
827  } while(0)
828 
829 #define TEST_WRITE_READ_GEN_POSITION(pos) \
830  do { \
831  error = GEN_write_position(gb_gene, (pos), 0); \
832  if (!error) { \
833  GEN_position *rpos = GEN_read_position(gb_gene); \
834  if (!rpos) { \
835  error = GB_await_error(); \
836  } \
837  else { \
838  TEST_EXPECT_GENPOS_EQUAL((pos), rpos); \
839  GEN_free_position(rpos); \
840  } \
841  } \
842  TEST_EXPECT_NULL(error.deliver()); \
843  } while(0)
844 
845 #define TEST_WRITE_GEN_POSITION_ERROR(pos,exp_error) do { \
846  error = GEN_write_position(gb_gene, &*(pos), 0); \
847  TEST_EXPECT_EQUAL(error.deliver(), exp_error); \
848  } while(0)
849 
850 #define TEST_GENPOS_FIELD(field,value) do { \
851  GBDATA *gb_field = GB_entry(gb_gene, (field)); \
852  if ((value)) { \
853  TEST_REJECT_NULL(gb_field); \
854  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_field), (value)); \
855  } \
856  else { \
857  TEST_EXPECT_NULL(gb_field); \
858  } \
859  } while(0)
860 
861 #define TEST_GENPOS_FIELDS(start,stop,complement,certain) do { \
862  TEST_GENPOS_FIELD("pos_start", start); \
863  TEST_GENPOS_FIELD("pos_stop", stop); \
864  TEST_GENPOS_FIELD("pos_complement", complement); \
865  TEST_GENPOS_FIELD("pos_certain", certain); \
866  } while(0)
867 
868 #define TEST_GENE_SEQ_AND_LENGTH(werr,wseq,wlen) do { \
869  size_t len; \
870  char *seq = GBT_read_gene_sequence_and_length(gb_gene, true, '-', &len); \
871  TEST_EXPECT_EQUAL(GB_have_error(), werr); \
872  if (seq) { \
873  TEST_EXPECT_EQUAL(len, (size_t)(wlen)); \
874  TEST_EXPECT_EQUAL(seq, (wseq)); \
875  free(seq); \
876  } \
877  else { \
878  GB_clear_error(); \
879  } \
880  } while(0)
881 
882 void TEST_GEN_position() {
883  // see also ../GENOM_IMPORT/Location.cxx@TEST_gene_location
884 
885  GB_shell shell;
887  GBDATA *gb_main = TEST_CREATE_DB(error, "ali_genom", TestAlignmentData_Genome, false);
888 
889  TEST_EXPECT_NULL(error.deliver());
890 
891  {
892  GB_transaction ta(gb_main);
893 
894  GBDATA *gb_organism = GBT_find_species(gb_main, "spec"); TEST_REJECT_NULL(gb_organism);
895  GBDATA *gb_gene_data = GEN_findOrCreate_gene_data(gb_organism); TEST_REJECT_NULL(gb_gene_data);
896  GBDATA *gb_gene = GEN_create_nonexisting_gene_rel_gene_data(gb_gene_data, "gene"); TEST_REJECT_NULL(gb_gene);
897 
898  typedef SmartCustomPtr(GEN_position, GEN_free_position) GEN_position_Ptr;
899  GEN_position_Ptr pos;
900 
901  pos = GEN_new_position(1, false);
902 
903  TEST_WRITE_GEN_POSITION_ERROR(pos, "Illegal start position 0");
904 
905  pos->start_pos[0] = 5;
906  pos->stop_pos[0] = 10;
907  pos->complement[0] = 1;
908 
909  GEN_use_uncertainties(&*pos);
910 
911  TEST_WRITE_READ_GEN_POSITION(&*pos);
912  TEST_GENPOS_FIELDS("5", "10", "1", "==");
913 
914  TEST_GENE_SEQ_AND_LENGTH(false, "TTTAGG", 6);
915 
916  // ----------
917 
918  pos = GEN_new_position(3, false);
919 
920  TEST_WRITE_GEN_POSITION_ERROR(pos, "Illegal start position 0");
921 
922  GEN_use_uncertainties(&*pos);
923 
924  pos->start_pos[0] = 5; pos->start_pos[1] = 10; pos->start_pos[2] = 25;
925  pos->stop_pos[0] = 15; pos->stop_pos[1] = 20; pos->stop_pos[2] = 25;
926  pos->complement[0] = 0; pos->complement[1] = 1; pos->complement[2] = 0;
927 
928  pos->start_uncertain[0] = '<';
929  pos->stop_uncertain[2] = '>';
930 
931  TEST_WRITE_READ_GEN_POSITION(&*pos);
932  TEST_GENPOS_FIELDS("5,10,25", "15,20,25", "0,1,0", "<=,==,=>");
933 
934  TEST_GENE_SEQ_AND_LENGTH(false, "CCUAAACCCAA-TACGGTTGGGT-G", 25);
935 
936  pos->stop_uncertain[2] = 'x';
937  TEST_WRITE_GEN_POSITION_ERROR(pos, "Invalid uncertainty 'x'");
938 
939  pos->stop_uncertain[2] = '+';
940  TEST_WRITE_GEN_POSITION_ERROR(pos, "Invalid uncertainty '+'"); // invalid for stop
941 
942  pos->start_uncertain[2] = '+';
943  pos->stop_uncertain[2] = '-';
944  TEST_WRITE_GEN_POSITION_ERROR(pos, "Invalid positions 25^25 for uncertainties +-");
945 
946  pos->stop_pos[2] = 26;
947  TEST_WRITE_GEN_POSITION_ERROR(pos, (void*)NULp);
948 
949  pos->stop_pos[0] = 100;
950  TEST_WRITE_GEN_POSITION_ERROR(pos, "Illegal stop position 100 (>length(=32))");
951  }
952 
953  GB_close(gb_main);
954 }
955 
956 #endif // UNIT_TESTS
957 
GBDATA * GEN_create_nonexisting_gene(GBDATA *gb_species, const char *name)
Definition: adGene.cxx:96
const char * GB_ERROR
Definition: arb_core.h:25
GBDATA * GEN_next_pseudo_species(GBDATA *gb_species)
Definition: adGene.cxx:701
static GBDATA * GEN_read_pseudo_species_from_hash(const GB_HASH *pseudo_hash, const char *organism_name, const char *gene_name)
Definition: adGene.cxx:595
GBDATA * GEN_first_gene(GBDATA *gb_species)
Definition: adGene.cxx:130
unsigned char * complement
Definition: adGene.h:41
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
size_t size() const
Definition: arb_strarray.h:85
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:723
static GB_ERROR parsePositions(GBDATA *gb_gene, const char *field_name, int parts_expected, size_t *results, ConstStrArray &parseTable)
Definition: adGene.cxx:232
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:457
GB_ERROR GEN_write_position(GBDATA *gb_gene, const GEN_position *pos, long seqLength)
Definition: adGene.cxx:325
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1385
GBDATA * GEN_find_origin_organism(GBDATA *gb_pseudo, const GB_HASH *organism_hash)
Definition: adGene.cxx:653
static GEN_position * lastFreedPosition
Definition: adGene.cxx:153
const char * GEN_origin_organism(GBDATA *gb_pseudo)
Definition: adGene.cxx:559
GBDATA * GBT_expect_item_rel_item_data(GBDATA *gb_item_data, const char *id_field, const char *id_value)
Definition: aditem.cxx:91
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2549
bool GEN_is_organism(GBDATA *gb_species)
Definition: adGene.cxx:731
GBDATA * GEN_expect_gene(GBDATA *gb_species, const char *name)
Definition: adGene.cxx:72
int parts
Definition: adGene.h:37
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
void GBK_terminatef(const char *templat,...)
Definition: arb_msg.cxx:477
#define swap(a, b, type)
char * GEN_global_gene_identifier(GBDATA *gb_gene, GBDATA *gb_organism)
Definition: adGene.cxx:793
GEN_position * GEN_new_position(int parts, bool joinable)
Definition: adGene.cxx:155
GBDATA * GEN_find_origin_gene(GBDATA *gb_pseudo, const GB_HASH *organism_hash)
Definition: adGene.cxx:676
GBDATA * GEN_next_marked_organism(GBDATA *gb_organism)
Definition: adGene.cxx:783
GBDATA * GB_get_grandfather(GBDATA *gbd)
Definition: arbdb.cxx:1726
GBDATA * GB_get_father(GBDATA *gbd)
Definition: arbdb.cxx:1720
GB_ERROR GB_push_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2482
static struct GEN_position_mem_handler GEN_position_dealloc
GBDATA * GEN_first_marked_pseudo_species(GBDATA *gb_main)
Definition: adGene.cxx:721
static GB_ERROR parseCSV(GBDATA *gb_gene, const char *field_name, size_t parts_expected, ConstStrArray &parseTable)
Definition: adGene.cxx:211
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1904
static HelixNrInfo * start
static const char * pseudo_species_hash_key(const char *organism_name, const char *gene_name)
Definition: adGene.cxx:591
GB_ERROR GB_export_error(const char *error)
Definition: arb_msg.cxx:259
GBS_strstruct * GBS_stropen(long init_size)
Definition: arb_strbuf.cxx:39
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
GBDATA * GB_create_container(GBDATA *father, const char *key)
Definition: arbdb.cxx:1827
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:752
GB_ERROR deliver() const
Definition: arb_error.h:114
long GEN_get_organism_count(GBDATA *gb_main)
Definition: adGene.cxx:766
void GEN_add_pseudo_species_to_hash(GBDATA *gb_pseudo, GB_HASH *pseudo_hash)
Definition: adGene.cxx:599
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1779
GBDATA * GEN_next_marked_gene(GBDATA *gb_gene)
Definition: adGene.cxx:146
unsigned char * start_uncertain
Definition: adGene.h:53
static int cmp_location_parts(const void *v1, const void *v2)
Definition: adGene.cxx:486
static GEN_position * location2sort
Definition: adGene.cxx:484
GBDATA * GEN_first_gene_rel_gene_data(GBDATA *gb_gene_data)
Definition: adGene.cxx:134
const char * GEN_origin_gene(GBDATA *gb_pseudo)
Definition: adGene.cxx:563
#define false
Definition: ureadseq.h:13
GB_HASH * GEN_create_pseudo_species_hash(GBDATA *gb_main, long additionalSize)
Definition: adGene.cxx:609
GBDATA * GEN_find_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name)
Definition: adGene.cxx:60
size_t * stop_pos
Definition: adGene.h:40
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1310
void GBS_strcat(GBS_strstruct *strstr, const char *ptr)
Definition: arb_strbuf.cxx:108
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * GB_get_root(GBDATA *gbd)
Definition: arbdb.cxx:1738
bool GEN_is_pseudo_gene_species(GBDATA *gb_species)
Definition: adGene.cxx:568
GB_ERROR GEN_organism_not_found(GBDATA *gb_pseudo)
Definition: adGene.cxx:575
void GBT_splitNdestroy_string(ConstStrArray &names, char *&namelist, const char *separator, bool dropEmptyTokens)
bool GEN_is_genome_db(GBDATA *gb_main, int default_value)
Definition: adGene.cxx:20
GBDATA * GEN_first_organism(GBDATA *gb_main)
Definition: adGene.cxx:749
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
GEN_position * GEN_read_position(GBDATA *gb_gene)
Definition: adGene.cxx:250
GBDATA * GEN_next_gene(GBDATA *gb_gene)
Definition: adGene.cxx:138
void GEN_use_uncertainties(GEN_position *pos)
Definition: adGene.cxx:184
#define cmp(h1, h2)
Definition: admap.cxx:50
GBDATA * GBT_find_species_rel_species_data(GBDATA *gb_species_data, const char *name)
Definition: aditem.cxx:133
void GEN_free_position(GEN_position *pos)
Definition: adGene.cxx:195
GBDATA * GEN_first_marked_organism(GBDATA *gb_main)
Definition: adGene.cxx:777
bool GB_has_key(GBDATA *gbd, const char *key)
Definition: arbdb.cxx:1705
Definition: arbdb.h:86
#define GENOM_ALIGNMENT
Definition: adGene.h:19
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:670
GB_ERROR GB_write_int(GBDATA *gbd, long i)
Definition: arbdb.cxx:1244
GBDATA * GEN_find_organism(GBDATA *gb_main, const char *name)
Definition: adGene.cxx:738
#define GENOM_DB_TYPE
Definition: adGene.h:20
void GBS_chrcat(GBS_strstruct *strstr, char ch)
Definition: arb_strbuf.cxx:119
void GEN_sortAndMergeLocationParts(GEN_position *location)
Definition: adGene.cxx:497
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:264
static GBDATA * GEN_create_nonexisting_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name)
Definition: adGene.cxx:78
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
GBDATA * GEN_findOrCreate_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:44
#define TEST_EXPECT_NULL(n)
Definition: test_unit.h:1307
#define gb_assert(cond)
Definition: arbdbt.h:11
static GBDATA * GEN_next_marked_pseudo_species(GBDATA *gb_species)
Definition: adGene.cxx:711
GBDATA * GEN_find_gene(GBDATA *gb_species, const char *name)
Definition: adGene.cxx:64
GBDATA * GB_next_marked(GBDATA *gbd, const char *keystring)
Definition: adquery.cxx:670
#define assert_or_exit(cond)
Definition: arb_assert.h:263
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
GBDATA * GEN_find_pseudo_species(GBDATA *gb_main, const char *organism_name, const char *gene_name, const GB_HASH *pseudo_hash)
Definition: adGene.cxx:623
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:903
char * GBS_strclose(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:69
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
typedef SmartCustomPtr(GEN_position, GEN_free_position) GEN_positionPtr
GBDATA * GEN_find_or_create_gene_rel_gene_data(GBDATA *gb_gene_data, const char *name)
Definition: adGene.cxx:100
size_t * start_pos
Definition: adGene.h:39
GBDATA * GB_find_string(GBDATA *gbd, const char *key, const char *str, GB_CASE case_sens, GB_SEARCH_TYPE gbs)
Definition: adquery.cxx:302
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:97
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
void reserve(size_t forElems)
Definition: arb_strarray.h:82
GBDATA * GEN_first_marked_gene(GBDATA *gb_species)
Definition: adGene.cxx:143
GBDATA * GEN_find_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:50
GBDATA * GB_first_marked(GBDATA *gbd, const char *keystring)
Definition: adquery.cxx:655
long GBT_get_species_count(GBDATA *gb_main)
Definition: aditem.cxx:207
GBDATA * GBT_find_item_rel_item_data(GBDATA *gb_item_data, const char *id_field, const char *id_value)
Definition: aditem.cxx:74
GBDATA * GEN_first_pseudo_species(GBDATA *gb_main)
Definition: adGene.cxx:694
GB_transaction ta(gb_var)
bool joinable
Definition: adGene.h:38
Definition: trnsprob.h:20
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GBDATA * gb_main
Definition: adname.cxx:33
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:441
size_t length
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:395
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
GBDATA * GEN_next_organism(GBDATA *gb_organism)
Definition: adGene.cxx:755
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:649
GBDATA * GEN_expect_gene_data(GBDATA *gb_species)
Definition: adGene.cxx:54
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
unsigned char * stop_uncertain
Definition: adGene.h:54
Definition: arbdb.h:66