20 #include <sys/types.h>
22 #define gp_assert(cond) arb_assert(cond)
38 bool operator()(
const char *name1,
const char *name2)
const {
42 #if defined(UNIT_TESTS) // UT_DIFF
43 return strcmp(name1, name2)<0;
45 return (name1-name2)<0;
62 void check_legal()
const {
76 int length()
const {
return end-begin+1; }
83 return ! ((end < other.
begin) || (other.
end < begin));
86 #if defined(DUMP_OVERLAP_CALC)
87 void dump(
const char *note)
const {
88 printf(
"%s begin=%i end=%i\n", note, begin, end);
90 #endif // DUMP_OVERLAP_CALC
105 typedef set<PositionPair, ltNonOverlap> OverlappingGeneSet;
107 OverlappingGeneSet usedRanges;
108 unsigned long overlapSize;
109 unsigned long geneSize;
118 #if defined(DUMP_OVERLAP_CALC)
120 #endif // DUMP_OVERLAP_CALC
127 OverlappingGeneSet::iterator found = usedRanges.find(gene);
128 if (found == usedRanges.end()) {
129 usedRanges.insert(gene);
133 int gene_length = gene.
length();
139 int combined_length = gene.length();
141 size_t overlap = (found->length()+gene_length)-combined_length;
142 overlapSize += overlap;
143 geneSize += gene_length;
145 usedRanges.erase(found);
147 gene_length = combined_length;
148 found = usedRanges.find(gene);
149 }
while (found != usedRanges.end());
151 usedRanges.insert(gene);
156 OverlappingGeneSet::iterator end = usedRanges.end();
157 OverlappingGeneSet::iterator curr = usedRanges.begin();
158 OverlappingGeneSet::iterator prev = end;
161 intergeneList.push_back(wholeGenome);
164 if (curr->begin > wholeGenome.
begin) {
170 while (curr != end) {
171 if (prev->end < curr->begin) {
172 if (prev->end != (curr->begin-1)) {
173 intergeneList.push_back(
PositionPair(prev->end+1, curr->begin-1));
177 return "Internal error: Overlapping gene ranges";
183 if (prev != end && prev->end < wholeGenome.
end) {
190 #if defined(DUMP_OVERLAP_CALC)
191 void GenePositionMap::dump()
const {
192 printf(
"List of ranges used by genes:\n");
193 for (OverlappingGeneSet::iterator g = usedRanges.begin(); g != usedRanges.end(); ++g) {
196 printf(
"Overlap: %lu bases\n", getOverlap());
198 #endif // DUMP_OVERLAP_CALC
204 char *gene_sequence =
new char[seqlen+1];
206 memcpy(gene_sequence, sequence, seqlen);
207 gene_sequence[seqlen] = 0;
213 delete [] gene_sequence;
222 if (*name ==
'\\') ++name;
227 #define CHECK_SEMI_ESCAPED(s)
235 const char *firstSem = strchr(long_name,
';');
258 names[static_internal_name] = long_name;
268 error =
GBS_global_string(
"%s (internal_name='%s', long_name='%s')", error, internal_name, long_name);
277 GBDATA *gb_genespecies =
create_gene_species(gb_species_data2, internal_name, long_name, start_pos, ali_genome+start_pos, end_pos-start_pos+1);
282 if (start_pos <= end_pos) {
283 char internal_name[128];
284 sprintf(internal_name,
"i%x", intergene_counter++);
285 return create_genelike_entry(internal_name, gb_species_data2, start_pos, end_pos, ali_genome, long_gene_name);
287 return "Illegal inter-gene positions (start behind end)";
291 if (start_pos <= end_pos) {
292 char internal_name[128];
293 sprintf(internal_name,
"n%x", gene_counter++);
294 return create_genelike_entry(internal_name, gb_species_data2, start_pos, end_pos, ali_genome, long_gene_name);
296 return "Illegal gene positions (start behind end)";
301 PositionPairList::iterator list_end = part_list.end();
304 for (PositionPairList::iterator part = part_list.begin(); part != list_end; ++part) {
305 int part_size = part->end-part->begin+1;
307 gene_size += part_size;
310 char *gene_sequence =
new char[gene_size+1];
313 char *split_pos_list =
NULp;
315 for (PositionPairList::iterator part = part_list.begin(); part != list_end;) {
316 int part_size = part->end-part->begin+1;
317 int genome_pos = part->begin;
318 memcpy(gene_sequence+gene_off, ali_genome+part->begin, part_size);
319 gene_off += part_size;
323 if (!split_pos_list) {
327 char *next_split_pos_list;
328 if (part != list_end) {
334 freeset(split_pos_list, next_split_pos_list);
338 char internal_name[128];
339 sprintf(internal_name,
"s%x", split_gene_counter++);
343 gene_sequence, first_part.
end-first_part.
begin+1);
347 #if defined(DEBUG) && 0
348 printf(
"split gene: long_gene_name='%s' internal_name='%s' split_pos_list='%s'\n",
349 long_gene_name, internal_name, split_pos_list);
354 free(split_pos_list);
355 delete [] gene_sequence;
367 int parts = location->
parts;
368 for (
int p = 0; p<parts; ++p) {
396 if (!organism_name && !error) {
397 error =
"encountered invalid organism (lacks 'name' entry)";
409 if (!error && !gene_name) error =
"encountered invalid gene (lacks 'name' entry)";
410 if (!error && part_list.empty()) error =
"empty position list";
412 int split_count = part_list.size();
418 if (split_count == 1) {
419 error =
create_gene(gb_species_data2, first_part.
begin, first_part.
end, ali_genom, long_gene_name);
423 error =
create_split_gene(gb_species_data2, part_list, ali_genom, long_gene_name);
425 for (PositionPairList::iterator p = part_list.begin(); p != part_list.end(); ++p) {
429 free(long_gene_name);
434 if (error && gene_name) error =
GBS_global_string(
"in gene '%s': %s", gene_name, error);
439 PositionPair wholeGenome(0, PositionPair::genome_length-1);
442 for (PositionPairList::iterator i = intergenes.begin(); !error && i != intergenes.end(); ++i) {
443 char *long_intergene_name =
GBS_global_string_copy(
"%s;intergene_%i_%i", organism_name, i->begin, i->end);
444 error =
create_intergene(gb_species_data2, i->begin, i->end, ali_genom, long_intergene_name);
445 free(long_intergene_name);
449 if (error && organism_name) error =
GBS_global_string(
"in organism '%s': %s", organism_name, error);
452 int new_genes = gene_counter-gene_counter_old;
453 int new_split_genes = split_gene_counter-split_gene_counter_old;
454 int new_intergenes = intergene_counter-intergene_counter_old;
457 unsigned long overlaps = geneRanges.
getOverlap();
458 double data_grow = overlaps/double(PositionPair::genome_length)*100;
459 double gene_overlap = overlaps/double(genesSize)*100;
461 if (new_split_genes) {
463 printf(
" - %s: %i genes (%i split), %i intergenes",
464 organism_name, new_genes+new_split_genes, new_split_genes, new_intergenes);
467 printf(
" - %s: %i genes, %i intergenes",
468 organism_name, new_genes, new_intergenes);
470 printf(
" (data grow: %5.2f%%, gene overlap: %5.2f%%=%lu bp)\n", data_grow, gene_overlap, overlaps);
473 #if defined(DUMP_OVERLAP_CALC)
475 #endif // DUMP_OVERLAP_CALC
483 "arb_gene_probe 1.2 -- (C) 2003/2004 The ARB-project\n"
484 "written by Tom Littschwager, Bernd Spanfelner, Conny Wolf, Ralf Westram.\n");
487 printf(
"Usage: arb_gene_probe input_database output_database\n");
488 printf(
" Prepares a genome database for Gene-PT-Server\n");
492 const char *inputname = argv[1];
493 const char *outputname = argv[2];
497 printf(
"Converting '%s' -> '%s' ..\n", inputname, outputname);
514 int non_ali_genom_species = 0;
515 int ali_genom_species = 0;
518 gb_species && !
error;
524 ++non_ali_genom_species;
532 if (non_ali_genom_species) {
533 printf(
"%i species had no alignment in '" GENOM_ALIGNMENT "' and have been skipped.\n", non_ali_genom_species);
535 if (!error && ali_genom_species == 0) {
536 error =
"no species with data in alignment '" GENOM_ALIGNMENT "' were found";
541 "Found %i genes (%i were split) and %i intergene regions.\n",
542 ali_genom_species, gene_counter, split_gene_counter, intergene_counter);
553 FullNameMap::iterator NameEnd =
names.end();
554 FullNameMap::iterator NameIter;
557 for (NameIter =
names.begin(); NameIter != NameEnd; ++NameIter) {
558 mapsize += strlen(NameIter->first)+NameIter->second.length()+2;
561 map_string =
new char[mapsize+1];
564 for (NameIter =
names.begin(); NameIter != NameEnd; ++NameIter) {
565 int len1 = strlen(NameIter->first);
566 int len2 = NameIter->second.length();
568 memcpy(map_string+moff, NameIter->first, len1);
569 map_string[moff+len1] =
';';
572 memcpy(map_string+moff, NameIter->second.c_str(), len2);
573 map_string[moff+len2] =
';';
576 map_string[moff] = 0;
585 delete [] map_string;
605 printf(
"Saving '%s' ..\n", outputname);
606 error =
GB_save_as(gb_main, outputname,
"bfm");
607 if (error) unlink(outputname);
614 printf(
"Error in arb_gene_probe: %s\n", error);
618 printf(
"arb_gene_probe done.\n");
GB_ERROR GB_begin_transaction(GBDATA *gbd)
static GB_ERROR create_gene(GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_gene_name)
static int split_gene_counter
GBDATA * GB_open(const char *path, const char *opent)
static GB_ERROR create_data_entry(GBDATA *gb_species2, const char *sequence, int seqlen)
GBDATA * GEN_next_gene(GBDATA *gb_gene)
unsigned long getAllGeneSize() const
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
void announceGene(PositionPair gene)
void GEN_free_position(GEN_position *pos)
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
static GB_ERROR create_genelike_entry(const char *internal_name, GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_name)
const char * GBS_global_string(const char *templat,...)
static GB_ERROR scan_gene_positions(GBDATA *gb_gene, PositionPairList &part_list)
void GEN_sortAndMergeLocationParts(GEN_position *location)
char * GBS_escape_string(const char *str, const char *chars_to_escape, char escape_char)
static int intergene_counter
map< const char *, string, nameOrder > FullNameMap
GB_ERROR GB_push_transaction(GBDATA *gbd)
GB_ERROR GB_delete(GBDATA *&source)
GBDATA * GBT_first_species_rel_species_data(GBDATA *gb_species_data)
PositionPair(int begin_, int end_)
#define CHECK_SEMI_ESCAPED(s)
static GB_ERROR insert_genes_of_organism(GBDATA *gb_organism, GBDATA *gb_species_data2)
GB_ERROR GB_export_error(const char *error)
GB_ERROR GB_await_error()
GBDATA * GB_create_container(GBDATA *father, const char *key)
long GB_read_count(GBDATA *gbd)
GB_ERROR buildIntergeneList(const PositionPair &wholeGenome, PositionPairList &intergeneList) const
bool overlapsWith(const PositionPair &other) const
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
GB_ERROR GBT_set_default_alignment(GBDATA *gb_main, const char *alignment_name)
GB_ERROR GB_save_as(GBDATA *gbd, const char *path, const char *savetype)
static void error(const char *msg)
static GB_ERROR create_split_gene(GBDATA *gb_species_data2, PositionPairList &part_list, const char *ali_genome, const char *long_gene_name)
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
list< PositionPair > PositionPairList
int ARB_main(int argc, char *argv[])
unsigned long getOverlap() const
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
GBDATA * GBT_create(GBDATA *father, const char *key, long delete_level)
GBDATA * GEN_first_gene(GBDATA *gb_species)
GBDATA * GBT_next_species(GBDATA *gb_species)
const char * GBT_get_name(GBDATA *gb_item)
GB_ERROR GB_request_undo_type(GBDATA *gb_main, GB_UNDO_TYPE type) __ATTR__USERESULT_TODO
GB_ERROR GBT_write_int(GBDATA *gb_container, const char *fieldpath, long content)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
static GBDATA * create_gene_species(GBDATA *gb_species_data2, const char *internal_name, const char *long_name, int abspos, const char *sequence, int length)
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
static GB_ERROR create_intergene(GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_gene_name)
GEN_position * GEN_read_position(GBDATA *gb_gene)
bool operator()(const char *name1, const char *name2) const
char * GBS_global_string_copy(const char *templat,...)
void GB_close(GBDATA *gbd)
GBDATA * GBT_get_species_data(GBDATA *gb_main)