ARB
GDE_arbdb_io.cxx
Go to the documentation of this file.
1 #include "GDE_proto.h"
2 
3 #include <aw_msg.hxx>
4 #include <AW_rename.hxx>
5 #include <AP_filter.hxx>
6 #include <aw_awar_defs.hxx>
7 #include <arb_progress.h>
8 #include <arb_global_defs.h>
9 
10 #include <algorithm>
11 
12 // AISC_MKPT_PROMOTE:#ifndef GDE_EXTGLOB_H
13 // AISC_MKPT_PROMOTE:#include "GDE_extglob.h"
14 // AISC_MKPT_PROMOTE:#endif
15 
16 typedef unsigned int UINT;
17 
19  int curelem = dataset.numelements++;
20  if (curelem == 0) {
21  dataset.maxnumelements = 5;
22  ARB_alloc(dataset.element, dataset.maxnumelements);
23  }
24  else if (curelem == dataset.maxnumelements) {
25  dataset.maxnumelements *= 2;
26  ARB_realloc(dataset.element, dataset.maxnumelements);
27  }
28  return curelem;
29 }
30 
31 static void set_constant_fields(NA_Sequence *this_elem) {
32  this_elem->attr = DEFAULT_X_ATTR;
33  this_elem->comments = ARB_strdup("no comments");
34  this_elem->comments_maxlen = 1 + (this_elem->comments_len = strlen(this_elem->comments));
35  this_elem->rmatrix = NULp;
36  this_elem->tmatrix = NULp;
37  this_elem->col_lut = Default_PROColor_LKUP;
38 }
39 
40 static void AppendNA_and_free(NA_Sequence *this_elem, uchar *& sequfilt) {
41  AppendNA((NA_Base *)sequfilt, strlen((const char *)sequfilt), this_elem);
42  freenull(sequfilt);
43 }
44 
46  GBDATA **the_species,
47  unsigned char **the_names,
48  unsigned char **the_sequences,
49  unsigned long numberspecies,
50  unsigned long maxalignlen,
51  const AP_filter *filter,
52  GapCompression compress,
53  bool cutoff_stop_codon,
54  TypeInfo typeinfo)
55 {
56  GBDATA *gb_species;
57  NA_Sequence *this_elem;
58  AP_filter *allocatedFilter = NULp;
59 
60  gde_assert(contradicted(the_species, the_names));
61 
62  if (!filter) {
63  allocatedFilter = new AP_filter(maxalignlen);
64  filter = allocatedFilter;
65  }
66  else {
67  size_t fl = filter->get_length();
68  if (fl < maxalignlen) {
69  aw_message("Warning: Your filter is shorter than the alignment len");
70  maxalignlen = fl;
71  }
72  }
73 
74  GB_ERROR error = filter->is_invalid();
75  if (!error) {
76  size_t *seqlen = ARB_calloc<size_t>(numberspecies);
77  // sequences may have different length
78  {
79  unsigned long i;
80  for (i=0; i<numberspecies; i++) {
81  seqlen[i] = strlen((char *)the_sequences[i]);
82  }
83  }
84 
85  if (cutoff_stop_codon) {
86  unsigned long i;
87  fputs("[CUTTING STOP CODONS]\n", stdout);
88  for (i=0; i<numberspecies; i++) {
89  uchar *seq = the_sequences[i];
90  uchar *stop_codon = (uchar*)strchr((char*)seq, '*');
91  if (stop_codon) {
92  long pos = stop_codon-seq;
93  long restlen = maxalignlen-pos;
94  memset(stop_codon, '.', restlen);
95  }
96  }
97  }
98 
99  // store (compressed) sequence data in array:
100  uchar **sequfilt = ARB_calloc<uchar*>(numberspecies+1);
102 
103  if (compress==COMPRESS_ALL) { // compress all gaps and filter positions
104  long len = filter->get_filtered_length();
105  unsigned long i;
106 
107  for (i=0; i<numberspecies; i++) {
108  ARB_calloc(sequfilt[i], len+1);
109  long newcount = 0;
110  for (unsigned long col=0; (col<maxalignlen); col++) {
111  char c = the_sequences[i][col];
112  if (!c) break;
113  if (filter->use_position(col) && !GAP::is_std_gap(c)) {
114  sequfilt[i][newcount++] = c;
115  }
116  }
117  }
118  }
119  else {
120  if (compress==COMPRESS_VERTICAL_GAPS || // compress vertical gaps (and '.')
121  compress == COMPRESS_NONINFO_COLUMNS) // and additionally all columns containing no info (only N or X)
122  {
123  size_t i;
124  bool isInfo[256];
125 
126  for (i=0; i<256; i++) isInfo[i] = true;
127  isInfo[UINT('-')] = false;
128  isInfo[UINT('.')] = false;
129  if (compress == COMPRESS_NONINFO_COLUMNS) {
130  switch (alitype) {
131  case GB_AT_RNA:
132  case GB_AT_DNA:
133  isInfo[UINT('N')] = false;
134  isInfo[UINT('n')] = false;
135  break;
136  case GB_AT_AA:
137  isInfo[UINT('X')] = false;
138  isInfo[UINT('x')] = false;
139  break;
140  default:
141  gde_assert(0);
142  break;
143  }
144  }
145 
146  bool modified = false;
147  char *filterString = filter->to_string();
148 
149  for (i=0; i<maxalignlen; i++) {
150  if (filter->use_position(i)) {
151  bool wantColumn = false;
152 
153  for (size_t n=0; n<numberspecies; n++) {
154  if (i < seqlen[n]) {
155  if (isInfo[UINT(the_sequences[n][i])]) {
156  wantColumn = true; // have info -> take column
157  break;
158  }
159  }
160  }
161  if (!wantColumn) {
162  filterString[i] = '0';
163  modified = true;
164  }
165  }
166  }
167 
168  if (modified) {
169  size_t len = filter->get_length();
170 
171  delete allocatedFilter;
172  filter = allocatedFilter = new AP_filter(filterString, NULp, len);
173  }
174 
175  free(filterString);
176  }
177 
178  if (!error) error = filter->is_invalid();
179 
180  if (!error) {
181  long len = filter->get_filtered_length();
182  size_t i;
183 
184  for (i=0; i<numberspecies; i++) {
185  int c;
186  long newcount = 0;
187 
188  ARB_alloc(sequfilt[i], len+1);
189  sequfilt[i][len] = 0;
190  memset(sequfilt[i], '.', len); // Generate empty sequences
191 
192  const uchar *simplify = filter->get_simplify_table();
193  for (size_t col=0; (col<maxalignlen) && (c=the_sequences[i][col]); col++) {
194  if (filter->use_position(col)) {
195  sequfilt[i][newcount++] = simplify[c];
196  }
197  }
198  }
199  }
200  }
201  free(seqlen);
202 
203  if (!error) {
205 
206  char *str = filter->to_string();
208  free(str);
209  }
210 
211  if (!error) {
212  long number = 0;
213  int curelem;
214  int bad_names = 0;
215 
216  int elementtype = TEXT;
217  int elementtype_init = RNA;
218  switch (typeinfo) {
219  case UNKNOWN_TYPEINFO: gde_assert(0);
220  case BASIC_TYPEINFO: break;
221 
222  case DETAILED_TYPEINFO:
223  switch (alitype) {
224  case GB_AT_RNA: elementtype = RNA; break;
225  case GB_AT_DNA: elementtype = DNA; break;
226  case GB_AT_AA: elementtype = PROTEIN; break;
227  default : gde_assert(0); break;
228  }
229 
230  gde_assert(elementtype != TEXT);
231  elementtype_init = elementtype;
232  break;
233  }
234 
235  if (!error) {
236  arb_progress progress("Read data from DB", numberspecies);
237  if (the_species) {
238  for (gb_species = the_species[number]; gb_species && !error; gb_species = the_species[++number]) {
239  curelem = Arbdb_get_curelem(dataset);
240  this_elem = &(dataset.element[curelem]);
241 
242  InitNASeq(this_elem, elementtype_init);
243  this_elem->gb_species = gb_species;
244 
245 #define GET_FIELD_CONTENT(fieldname,buffer,bufsize) do { \
246  gbd = GB_entry(gb_species, fieldname); \
247  if (gbd) { \
248  const char *val = GB_read_char_pntr(gbd); \
249  strcpy_truncate(buffer, val, bufsize); \
250  } \
251  else buffer[0] = 0; \
252  } while(0)
253 
254  GBDATA *gbd;
255  GET_FIELD_CONTENT("name", this_elem->short_name, SIZE_SHORT_NAME);
256  GET_FIELD_CONTENT("author", this_elem->authority, SIZE_AUTHORITY);
257  GET_FIELD_CONTENT("full_name", this_elem->seq_name, SIZE_SEQ_NAME);
258  GET_FIELD_CONTENT("acc", this_elem->id, SIZE_ID);
259 
260  this_elem->elementtype = elementtype;
261 
262  if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
263  AppendNA_and_free(this_elem, sequfilt[number]);
264  set_constant_fields(this_elem);
265  progress.inc_and_check_user_abort(error);
266  }
267  }
268  else { // use the_names
269  unsigned char *species_name;
270 
271  for (species_name=the_names[number]; species_name && !error; species_name=the_names[++number]) {
272  curelem = Arbdb_get_curelem(dataset);
273  this_elem = &(dataset.element[curelem]);
274 
275  InitNASeq(this_elem, elementtype_init);
276  this_elem->gb_species = NULp;
277 
278  strcpy_truncate(this_elem->short_name, (char*)species_name, SIZE_SHORT_NAME);
279  this_elem->authority[0] = 0;
280  this_elem->seq_name[0] = 0;
281  this_elem->id[0] = 0;
282  this_elem->elementtype = elementtype;
283 
284  if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
285  AppendNA_and_free(this_elem, sequfilt[number]);
286  set_constant_fields(this_elem);
287  progress.inc_and_check_user_abort(error);
288  }
289  }
290  }
291 
292  if (!error) {
293  if (bad_names) {
294  aw_message(GBS_global_string("Problematic names found: %i\n"
295  "External program call may fail or produce invalid results.\n"
296  "You might want to use 'Species/Synchronize IDs' and read the associated help.",
297  bad_names));
298  }
299 
300  {
301  unsigned long i;
302  for (i=0; i<dataset.numelements; i++) {
303  dataset.maxlen = std::max(dataset.maxlen,
304  dataset.element[i].seqlen+dataset.element[i].offset);
305  }
306  for (i=0; i<numberspecies; i++) {
307  delete sequfilt[i];
308  }
309  free(sequfilt);
310  }
311  }
312  }
313  }
314 
315  delete allocatedFilter;
316  if (error) {
317  aw_message(error);
318  return 1; // = aborted
319  }
320  return 0;
321 
322 }
323 
324 int ReadArbdb2(NA_Alignment& dataset, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) {
325  // goes to header: __ATTR__USERESULT
326  GBDATA **the_species;
327  uchar **the_names;
328  uchar **the_sequences;
329  long maxalignlen;
330  long numberspecies = 0;
331 
332  char *error = db_access.get_sequences(the_species, the_names, the_sequences,
333  numberspecies, maxalignlen);
334 
335  gde_assert(contradicted(the_species, the_names));
336 
337  if (error) {
338  aw_message(error);
339  return 1;
340  }
341 
342  int res = InsertDatainGDE(dataset, NULp, the_names, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo);
343  for (long i=0; i<numberspecies; i++) {
344  delete the_sequences[i];
345  }
346  delete the_sequences;
347  if (the_species) delete the_species;
348  else delete the_names;
349 
350  return res;
351 }
352 
353 int ReadArbdb(NA_Alignment& dataset, bool marked, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) {
354  // goes to header: __ATTR__USERESULT
355 
357  GBDATA **the_species;
358  long numberspecies = 0;
359  long missingdata = 0;
360 
361  GBDATA *gb_species;
362  if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
363  else gb_species = GBT_first_species_rel_species_data(gb_species_data);
364 
365  while (gb_species) {
366  if (GBT_find_sequence(gb_species, dataset.alignment_name)) numberspecies++;
367  else missingdata++;
368 
369  if (marked) gb_species = GBT_next_marked_species(gb_species);
370  else gb_species = GBT_next_species(gb_species);
371  }
372 
373  if (missingdata) {
374  aw_message(GBS_global_string("Skipped %li species which did not contain data in '%s'", missingdata, dataset.alignment_name));
375  }
376 
377  ARB_calloc(the_species, numberspecies+1);
378  numberspecies = 0;
379 
380  if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
381  else gb_species = GBT_first_species_rel_species_data(gb_species_data);
382 
383  while (gb_species) {
384  if (GBT_find_sequence(gb_species, dataset.alignment_name)) {
385  the_species[numberspecies]=gb_species;
386  numberspecies++;
387  }
388 
389  if (marked) gb_species = GBT_next_marked_species(gb_species);
390  else gb_species = GBT_next_species(gb_species);
391  }
392 
393  long maxalignlen = GBT_get_alignment_len(db_access.gb_main, dataset.alignment_name);
394  char **the_sequences; ARB_calloc(the_sequences, numberspecies+1);
395 
396  for (long i=0; the_species[i]; i++) {
397  ARB_alloc(the_sequences[i], maxalignlen+1);
398  the_sequences[i][maxalignlen] = 0;
399  memset(the_sequences[i], '.', (size_t)maxalignlen);
400  const char *data = GB_read_char_pntr(GBT_find_sequence(the_species[i], dataset.alignment_name));
401  int size = strlen(data);
402  if (size > maxalignlen) size = (int)maxalignlen;
403  strcpy_truncate(the_sequences[i], data, size+1);
404  }
405 
406  int res = InsertDatainGDE(dataset, the_species, NULp, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo);
407  for (long i=0; i<numberspecies; i++) {
408  free(the_sequences[i]);
409  }
410  free(the_sequences);
411  free(the_species);
412 
413  return res;
414 }
415 
416 int getelem(NA_Sequence *a, int b) {
417  if (a->seqlen == 0) return -1;
418 
419  if (b<a->offset || (b>a->offset+a->seqlen)) {
420  switch (a->elementtype) {
421  case DNA:
422  case RNA: return 0;
423  case PROTEIN:
424  case TEXT: return '~';
425  case MASK: return '0';
426  default: return '-';
427  }
428  }
429 
430  return a->sequence[b-a->offset];
431 }
432 
433 void putelem(NA_Sequence *a, int b, NA_Base c) {
434  if (b>=(a->offset+a->seqmaxlen)) {
435  Warning("Putelem:insert beyond end of sequence space ignored");
436  }
437  else if (b >= (a->offset)) {
438  a->sequence[b-(a->offset)] = c;
439  }
440  else {
441  NA_Base *temp = ARB_calloc<NA_Base>(a->seqmaxlen + a->offset - b);
442  switch (a->elementtype) {
443  // Pad out with gap characters fron the point of insertion to the offset
444  case MASK:
445  for (int j=b; j<a->offset; j++) temp[j-b] = '0';
446  break;
447 
448  case DNA:
449  case RNA:
450  for (int j=b; j<a->offset; j++) temp[j-b] = '\0';
451  break;
452 
453  case PROTEIN:
454  for (int j=b; j<a->offset; j++) temp[j-b] = '-';
455  break;
456 
457  case TEXT:
458  default:
459  for (int j=b; j<a->offset; j++) temp[j-b] = ' ';
460  break;
461  }
462 
463  for (int j=0; j<a->seqmaxlen; j++) temp[j+a->offset-b] = a->sequence[j];
464  free(a->sequence);
465 
466  a->sequence = temp;
467  a->seqlen += (a->offset - b);
468  a->seqmaxlen += (a->offset - b);
469  a->offset = b;
470  a->sequence[0] = c;
471  }
472 }
473 
#define MASK
Definition: GDE_def.h:44
const char * GB_ERROR
Definition: arb_core.h:25
char short_name[SIZE_SHORT_NAME]
Definition: GDE_def.h:86
GBDATA * gb_main
Definition: GDE_def.h:129
int elementtype
Definition: GDE_def.h:107
GBDATA * GBT_first_marked_species_rel_species_data(GBDATA *gb_species_data)
Definition: aditem.cxx:109
unsigned int UINT
int seqmaxlen
Definition: GDE_def.h:99
char seq_name[SIZE_SEQ_NAME]
Definition: GDE_def.h:85
#define DNA
Definition: GDE_def.h:41
int ReadArbdb2(NA_Alignment &dataset, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo)
#define PROTEIN
Definition: GDE_def.h:45
#define SIZE_AUTHORITY
Definition: GDE_def.h:81
int seqlen
Definition: GDE_def.h:98
GBDATA * gb_main
Definition: GDE_menu.h:90
#define DEFAULT_X_ATTR
Definition: GDE_def.h:58
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
#define RNA
Definition: GDE_def.h:42
NA_Sequence * element
Definition: GDE_def.h:124
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:706
const uchar * get_simplify_table() const
Definition: AP_filter.hxx:100
int * col_lut
Definition: GDE_def.h:102
int attr
Definition: GDE_def.h:100
int ReadArbdb(NA_Alignment &dataset, bool marked, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo)
FILE * seq
Definition: rns.c:46
int * rmatrix
Definition: GDE_def.h:111
#define AWAR_GDE_EXPORT_FILTER
GBDATA * GBT_first_species_rel_species_data(GBDATA *gb_species_data)
Definition: aditem.cxx:121
void putelem(NA_Sequence *a, int b, NA_Base c)
#define TEXT
Definition: GDE_def.h:43
NA_Base * sequence
Definition: GDE_def.h:95
void strcpy_truncate(char *dest, const char *source, size_t dest_size)
Definition: GDE_def.h:137
char * to_string() const
Definition: AP_filter.cxx:146
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
void AppendNA(NA_Base *buffer, int len, NA_Sequence *seq)
Definition: GDE_FileIO.cxx:244
GDE_get_sequences_cb get_sequences
Definition: GDE_menu.h:87
GBDATA * gb_species_data
Definition: adname.cxx:34
struct gde_database_access db_access
Definition: GDE.cxx:484
char * alignment_name
Definition: GDE_def.h:130
int AWTC_name_quality(const char *short_name)
Definition: AW_rename.cxx:692
#define GET_FIELD_CONTENT(fieldname, buffer, bufsize)
static void error(const char *msg)
Definition: mkptypes.cxx:96
#define SIZE_ID
Definition: GDE_def.h:77
size_t get_length() const
Definition: AP_filter.hxx:83
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
GB_alignment_type GBT_get_alignment_type(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:740
#define SIZE_SHORT_NAME
Definition: GDE_def.h:79
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:670
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
#define SIZE_SEQ_NAME
Definition: GDE_def.h:78
void Warning(const char *s)
Definition: GDE_FileIO.cxx:334
int comments_len
Definition: GDE_def.h:93
int maxnumelements
Definition: GDE_def.h:121
static __ATTR__USERESULT int InsertDatainGDE(NA_Alignment &dataset, GBDATA **the_species, unsigned char **the_names, unsigned char **the_sequences, unsigned long numberspecies, unsigned long maxalignlen, const AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo)
GB_alignment_type
Definition: arbdb_base.h:61
static void set_constant_fields(NA_Sequence *this_elem)
fputs(TRACE_PREFIX, stderr)
int comments_maxlen
Definition: GDE_def.h:93
unsigned char NA_Base
Definition: GDE_def.h:73
int offset
Definition: GDE_def.h:97
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
TypeInfo
Definition: GDE_menu.h:42
int * tmatrix
Definition: GDE_def.h:110
int Default_PROColor_LKUP[]
Definition: GDE_global.cxx:69
GapCompression
Definition: GDE_extglob.h:13
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
size_t numelements
Definition: GDE_def.h:120
#define __ATTR__USERESULT
Definition: attributes.h:58
void ARB_realloc(TYPE *&tgt, size_t nelem)
Definition: arb_mem.h:43
int getelem(NA_Sequence *a, int b)
static void AppendNA_and_free(NA_Sequence *this_elem, uchar *&sequfilt)
GB_ERROR is_invalid() const
Definition: AP_filter.hxx:123
GBDATA * gb_species
Definition: GDE_def.h:113
int maxlen
Definition: GDE_def.h:122
void aw_message(const char *msg)
Definition: AW_status.cxx:932
unsigned char uchar
Definition: gde.hxx:21
bool is_std_gap(const char c)
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:97
#define offset(field)
Definition: GLwDrawA.c:73
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:874
bool use_position(size_t pos) const
Definition: AP_filter.hxx:85
char authority[SIZE_AUTHORITY]
Definition: GDE_def.h:91
char id[SIZE_ID]
Definition: GDE_def.h:84
#define gde_assert(bed)
Definition: GDE_def.h:12
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:274
void InitNASeq(NA_Sequence *seq, int type)
Definition: GDE_FileIO.cxx:339
int Arbdb_get_curelem(NA_Alignment &dataset)
char * comments
Definition: GDE_def.h:92
GBDATA * GBT_get_species_data(GBDATA *gb_main)
Definition: aditem.cxx:105
#define max(a, b)
Definition: f2c.h:154