ARB
GDE_arbdb_io.cxx
Go to the documentation of this file.
1 #include "GDE_proto.h"
2 
3 #include <aw_msg.hxx>
4 #include <AW_rename.hxx>
5 #include <AP_filter.hxx>
6 #include <aw_awar_defs.hxx>
7 #include <arb_progress.h>
8 #include <arb_global_defs.h>
9 
10 #include <algorithm>
11 
12 // AISC_MKPT_PROMOTE:#ifndef GDE_EXTGLOB_H
13 // AISC_MKPT_PROMOTE:#include "GDE_extglob.h"
14 // AISC_MKPT_PROMOTE:#endif
15 
16 typedef unsigned int UINT;
17 
19  int curelem = dataset.numelements++;
20  if (curelem == 0) {
21  dataset.maxnumelements = 5;
22  ARB_alloc(dataset.element, dataset.maxnumelements);
23  }
24  else if (curelem == dataset.maxnumelements) {
25  dataset.maxnumelements *= 2;
26  ARB_realloc(dataset.element, dataset.maxnumelements);
27  }
28  return curelem;
29 }
30 
31 static void set_constant_fields(NA_Sequence *this_elem) {
32  this_elem->attr = DEFAULT_X_ATTR;
33  this_elem->comments = ARB_strdup("no comments");
34  this_elem->comments_maxlen = 1 + (this_elem->comments_len = strlen(this_elem->comments));
35  this_elem->rmatrix = NULp;
36  this_elem->tmatrix = NULp;
37  this_elem->col_lut = Default_PROColor_LKUP;
38 }
39 
40 static void AppendNA_and_free(NA_Sequence *this_elem, uchar *& sequfilt) {
41  AppendNA((NA_Base *)sequfilt, strlen((const char *)sequfilt), this_elem);
42  freenull(sequfilt);
43 }
44 
46  GBDATA **the_species,
47  unsigned char **the_names,
48  unsigned char **the_sequences,
49  unsigned long numberspecies,
50  unsigned long maxalignlen,
51  const AP_filter *filter,
52  GapCompression compress,
53  bool cutoff_stop_codon,
54  TypeInfo typeinfo)
55 {
56  GBDATA *gb_species;
57  NA_Sequence *this_elem;
58  AP_filter *allocatedFilter = NULp;
59 
60  gde_assert(contradicted(the_species, the_names));
61 
62  if (!filter) {
63  allocatedFilter = new AP_filter(maxalignlen);
64  filter = allocatedFilter;
65  }
66  else {
67  size_t fl = filter->get_length();
68  if (fl < maxalignlen) {
69  aw_message("Warning: Your filter is shorter than the alignment len");
70  maxalignlen = fl;
71  }
72  }
73 
74  GB_ERROR error = filter->is_invalid();
75  if (!error) {
76  size_t *seqlen = ARB_calloc<size_t>(numberspecies);
77  // sequences may have different length
78  {
79  unsigned long i;
80  for (i=0; i<numberspecies; i++) {
81  seqlen[i] = strlen((char *)the_sequences[i]);
82  }
83  }
84 
85  if (cutoff_stop_codon) {
86  unsigned long i;
87  fputs("[CUTTING STOP CODONS]\n", stdout);
88  for (i=0; i<numberspecies; i++) {
89  uchar *seq = the_sequences[i];
90  uchar *stop_codon = (uchar*)strchr((char*)seq, '*');
91  if (stop_codon) {
92  long pos = stop_codon-seq;
93  long restlen = maxalignlen-pos;
94  memset(stop_codon, '.', restlen);
95  }
96  }
97  }
98 
99  // store (compressed) sequence data in array:
100  uchar **sequfilt = ARB_calloc<uchar*>(numberspecies+1);
102  gde_assert(alitype != GB_AT_UNKNOWN);
103 
104  if (compress==COMPRESS_ALL) { // compress all gaps and filter positions
105  long len = filter->get_filtered_length();
106  unsigned long i;
107 
108  for (i=0; i<numberspecies; i++) {
109  ARB_calloc(sequfilt[i], len+1);
110  long newcount = 0;
111  for (unsigned long col=0; (col<maxalignlen); col++) {
112  char c = the_sequences[i][col];
113  if (!c) break;
114  if (filter->use_position(col) && !GAP::is_std_gap(c)) {
115  sequfilt[i][newcount++] = c;
116  }
117  }
118  }
119  }
120  else {
121  if (compress==COMPRESS_VERTICAL_GAPS || // compress vertical gaps (and '.')
122  compress == COMPRESS_NONINFO_COLUMNS) // and additionally all columns containing no info (only N or X)
123  {
124  size_t i;
125  bool isInfo[256];
126 
127  for (i=0; i<256; i++) isInfo[i] = true;
128  isInfo[UINT('-')] = false;
129  isInfo[UINT('.')] = false;
130  if (compress == COMPRESS_NONINFO_COLUMNS) {
131  switch (alitype) {
132  case GB_AT_RNA:
133  case GB_AT_DNA:
134  isInfo[UINT('N')] = false;
135  isInfo[UINT('n')] = false;
136  break;
137  case GB_AT_AA:
138  isInfo[UINT('X')] = false;
139  isInfo[UINT('x')] = false;
140  break;
141  default:
142  gde_assert(0);
143  break;
144  }
145  }
146 
147  bool modified = false;
148  char *filterString = filter->to_string();
149 
150  for (i=0; i<maxalignlen; i++) {
151  if (filter->use_position(i)) {
152  bool wantColumn = false;
153 
154  for (size_t n=0; n<numberspecies; n++) {
155  if (i < seqlen[n]) {
156  if (isInfo[UINT(the_sequences[n][i])]) {
157  wantColumn = true; // have info -> take column
158  break;
159  }
160  }
161  }
162  if (!wantColumn) {
163  filterString[i] = '0';
164  modified = true;
165  }
166  }
167  }
168 
169  if (modified) {
170  size_t len = filter->get_length();
171 
172  delete allocatedFilter;
173  filter = allocatedFilter = new AP_filter(filterString, NULp, len);
174  }
175 
176  free(filterString);
177  }
178 
179  if (!error) error = filter->is_invalid();
180 
181  if (!error) {
182  long len = filter->get_filtered_length();
183  size_t i;
184 
185  for (i=0; i<numberspecies; i++) {
186  int c;
187  long newcount = 0;
188 
189  ARB_alloc(sequfilt[i], len+1);
190  sequfilt[i][len] = 0;
191  memset(sequfilt[i], '.', len); // Generate empty sequences
192 
193  const uchar *simplify = filter->get_simplify_table();
194  for (size_t col=0; (col<maxalignlen) && (c=the_sequences[i][col]); col++) {
195  if (filter->use_position(col)) {
196  sequfilt[i][newcount++] = simplify[c];
197  }
198  }
199  }
200  }
201  }
202  free(seqlen);
203 
204  if (!error) {
206 
207  char *str = filter->to_string();
209  free(str);
210  }
211 
212  if (!error) {
213  long number = 0;
214  int curelem;
215  int bad_names = 0;
216 
217  int elementtype = TEXT;
218  int elementtype_init = RNA;
219  switch (typeinfo) {
220  case UNKNOWN_TYPEINFO: gde_assert(0);
221  case BASIC_TYPEINFO: break;
222 
223  case DETAILED_TYPEINFO:
224  switch (alitype) {
225  case GB_AT_RNA: elementtype = RNA; break;
226  case GB_AT_DNA: elementtype = DNA; break;
227  case GB_AT_AA: elementtype = PROTEIN; break;
228  default : gde_assert(0); break;
229  }
230 
231  gde_assert(elementtype != TEXT);
232  elementtype_init = elementtype;
233  break;
234  }
235 
236  if (!error) {
237  arb_progress progress("Read data from DB", numberspecies);
238  if (the_species) {
239  for (gb_species = the_species[number]; gb_species && !error; gb_species = the_species[++number]) {
240  curelem = Arbdb_get_curelem(dataset);
241  this_elem = &(dataset.element[curelem]);
242 
243  InitNASeq(this_elem, elementtype_init);
244  this_elem->gb_species = gb_species;
245 
246 #define GET_FIELD_CONTENT(fieldname,buffer,bufsize) do { \
247  gbd = GB_entry(gb_species, fieldname); \
248  if (gbd) { \
249  const char *val = GB_read_char_pntr(gbd); \
250  strcpy_truncate(buffer, val, bufsize); \
251  } \
252  else buffer[0] = 0; \
253  } while(0)
254 
255  GBDATA *gbd;
256  GET_FIELD_CONTENT("name", this_elem->short_name, SIZE_SHORT_NAME);
257  GET_FIELD_CONTENT("author", this_elem->authority, SIZE_AUTHORITY);
258  GET_FIELD_CONTENT("full_name", this_elem->seq_name, SIZE_SEQ_NAME);
259  GET_FIELD_CONTENT("acc", this_elem->id, SIZE_ID);
260 
261  this_elem->elementtype = elementtype;
262 
263  if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
264  AppendNA_and_free(this_elem, sequfilt[number]);
265  set_constant_fields(this_elem);
266  progress.inc_and_check_user_abort(error);
267  }
268  }
269  else { // use the_names
270  unsigned char *species_name;
271 
272  for (species_name=the_names[number]; species_name && !error; species_name=the_names[++number]) {
273  curelem = Arbdb_get_curelem(dataset);
274  this_elem = &(dataset.element[curelem]);
275 
276  InitNASeq(this_elem, elementtype_init);
277  this_elem->gb_species = NULp;
278 
279  strcpy_truncate(this_elem->short_name, (char*)species_name, SIZE_SHORT_NAME);
280  this_elem->authority[0] = 0;
281  this_elem->seq_name[0] = 0;
282  this_elem->id[0] = 0;
283  this_elem->elementtype = elementtype;
284 
285  if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
286  AppendNA_and_free(this_elem, sequfilt[number]);
287  set_constant_fields(this_elem);
288  progress.inc_and_check_user_abort(error);
289  }
290  }
291  }
292 
293  if (!error) {
294  if (bad_names) {
295  aw_message(GBS_global_string("Problematic names found: %i\n"
296  "External program call may fail or produce invalid results.\n"
297  "You might want to use 'Species/Synchronize IDs' and read the associated help.",
298  bad_names));
299  }
300 
301  {
302  unsigned long i;
303  for (i=0; i<dataset.numelements; i++) {
304  dataset.maxlen = std::max(dataset.maxlen,
305  dataset.element[i].seqlen+dataset.element[i].offset);
306  }
307  for (i=0; i<numberspecies; i++) {
308  delete sequfilt[i];
309  }
310  free(sequfilt);
311  }
312  }
313  }
314  }
315 
316  delete allocatedFilter;
317  if (error) {
318  aw_message(error);
319  return 1; // = aborted
320  }
321  return 0;
322 
323 }
324 
325 int ReadArbdb2(NA_Alignment& dataset, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) {
326  // goes to header: __ATTR__USERESULT
327  GBDATA **the_species;
328  uchar **the_names;
329  uchar **the_sequences;
330  long maxalignlen;
331  long numberspecies = 0;
332 
333  char *error = db_access.get_sequences(the_species, the_names, the_sequences,
334  numberspecies, maxalignlen);
335 
336  gde_assert(contradicted(the_species, the_names));
337 
338  if (error) {
339  aw_message(error);
340  return 1;
341  }
342 
343  int res = InsertDatainGDE(dataset, NULp, the_names, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo);
344  for (long i=0; i<numberspecies; i++) {
345  delete the_sequences[i];
346  }
347  delete the_sequences;
348  if (the_species) delete the_species;
349  else delete the_names;
350 
351  return res;
352 }
353 
354 int ReadArbdb(NA_Alignment& dataset, bool marked, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) {
355  // goes to header: __ATTR__USERESULT
356 
358  GBDATA **the_species;
359  long numberspecies = 0;
360  long missingdata = 0;
361 
362  GBDATA *gb_species;
363  if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
364  else gb_species = GBT_first_species_rel_species_data(gb_species_data);
365 
366  while (gb_species) {
367  if (GBT_find_sequence(gb_species, dataset.alignment_name)) numberspecies++;
368  else missingdata++;
369 
370  if (marked) gb_species = GBT_next_marked_species(gb_species);
371  else gb_species = GBT_next_species(gb_species);
372  }
373 
374  if (missingdata) {
375  aw_message(GBS_global_string("Skipped %li species which did not contain data in '%s'", missingdata, dataset.alignment_name));
376  }
377 
378  ARB_calloc(the_species, numberspecies+1);
379  numberspecies = 0;
380 
381  if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
382  else gb_species = GBT_first_species_rel_species_data(gb_species_data);
383 
384  while (gb_species) {
385  if (GBT_find_sequence(gb_species, dataset.alignment_name)) {
386  the_species[numberspecies]=gb_species;
387  numberspecies++;
388  }
389 
390  if (marked) gb_species = GBT_next_marked_species(gb_species);
391  else gb_species = GBT_next_species(gb_species);
392  }
393 
394  long maxalignlen = GBT_get_alignment_len(db_access.gb_main, dataset.alignment_name);
395  gde_assert(maxalignlen != -1);
396 
397  char **the_sequences; ARB_calloc(the_sequences, numberspecies+1);
398 
399  for (long i=0; the_species[i]; i++) {
400  ARB_alloc(the_sequences[i], maxalignlen+1);
401  the_sequences[i][maxalignlen] = 0;
402  memset(the_sequences[i], '.', (size_t)maxalignlen);
403  const char *data = GB_read_char_pntr(GBT_find_sequence(the_species[i], dataset.alignment_name));
404  int size = strlen(data);
405  if (size > maxalignlen) size = (int)maxalignlen;
406  strcpy_truncate(the_sequences[i], data, size+1);
407  }
408 
409  int res = InsertDatainGDE(dataset, the_species, NULp, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo);
410  for (long i=0; i<numberspecies; i++) {
411  free(the_sequences[i]);
412  }
413  free(the_sequences);
414  free(the_species);
415 
416  return res;
417 }
418 
419 int getelem(NA_Sequence *a, int b) {
420  if (a->seqlen == 0) return -1;
421 
422  if (b<a->offset || (b>a->offset+a->seqlen)) {
423  switch (a->elementtype) {
424  case DNA:
425  case RNA: return 0;
426  case PROTEIN:
427  case TEXT: return '~';
428  case MASK: return '0';
429  default: return '-';
430  }
431  }
432 
433  return a->sequence[b-a->offset];
434 }
435 
436 void putelem(NA_Sequence *a, int b, NA_Base c) {
437  if (b>=(a->offset+a->seqmaxlen)) {
438  Warning("Putelem:insert beyond end of sequence space ignored");
439  }
440  else if (b >= (a->offset)) {
441  a->sequence[b-(a->offset)] = c;
442  }
443  else {
444  NA_Base *temp = ARB_calloc<NA_Base>(a->seqmaxlen + a->offset - b);
445  switch (a->elementtype) {
446  // Pad out with gap characters fron the point of insertion to the offset
447  case MASK:
448  for (int j=b; j<a->offset; j++) temp[j-b] = '0';
449  break;
450 
451  case DNA:
452  case RNA:
453  for (int j=b; j<a->offset; j++) temp[j-b] = '\0';
454  break;
455 
456  case PROTEIN:
457  for (int j=b; j<a->offset; j++) temp[j-b] = '-';
458  break;
459 
460  case TEXT:
461  default:
462  for (int j=b; j<a->offset; j++) temp[j-b] = ' ';
463  break;
464  }
465 
466  for (int j=0; j<a->seqmaxlen; j++) temp[j+a->offset-b] = a->sequence[j];
467  free(a->sequence);
468 
469  a->sequence = temp;
470  a->seqlen += (a->offset - b);
471  a->seqmaxlen += (a->offset - b);
472  a->offset = b;
473  a->sequence[0] = c;
474  }
475 }
476 
#define MASK
Definition: GDE_def.h:44
const char * GB_ERROR
Definition: arb_core.h:25
char short_name[SIZE_SHORT_NAME]
Definition: GDE_def.h:86
GBDATA * gb_main
Definition: GDE_def.h:129
int elementtype
Definition: GDE_def.h:107
GBDATA * GBT_first_marked_species_rel_species_data(GBDATA *gb_species_data)
Definition: aditem.cxx:109
unsigned int UINT
int seqmaxlen
Definition: GDE_def.h:99
char seq_name[SIZE_SEQ_NAME]
Definition: GDE_def.h:85
#define DNA
Definition: GDE_def.h:41
int ReadArbdb2(NA_Alignment &dataset, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo)
#define PROTEIN
Definition: GDE_def.h:45
#define SIZE_AUTHORITY
Definition: GDE_def.h:81
int seqlen
Definition: GDE_def.h:98
GBDATA * gb_main
Definition: GDE_menu.h:90
#define DEFAULT_X_ATTR
Definition: GDE_def.h:58
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
#define RNA
Definition: GDE_def.h:42
NA_Sequence * element
Definition: GDE_def.h:124
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:202
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:833
const uchar * get_simplify_table() const
Definition: AP_filter.hxx:100
int * col_lut
Definition: GDE_def.h:102
int attr
Definition: GDE_def.h:100
int ReadArbdb(NA_Alignment &dataset, bool marked, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo)
FILE * seq
Definition: rns.c:46
int * rmatrix
Definition: GDE_def.h:111
#define AWAR_GDE_EXPORT_FILTER
GBDATA * GBT_first_species_rel_species_data(GBDATA *gb_species_data)
Definition: aditem.cxx:121
void putelem(NA_Sequence *a, int b, NA_Base c)
#define TEXT
Definition: GDE_def.h:43
NA_Base * sequence
Definition: GDE_def.h:95
void strcpy_truncate(char *dest, const char *source, size_t dest_size)
Definition: GDE_def.h:137
char * to_string() const
Definition: AP_filter.cxx:146
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
void AppendNA(NA_Base *buffer, int len, NA_Sequence *seq)
Definition: GDE_FileIO.cxx:244
GDE_get_sequences_cb get_sequences
Definition: GDE_menu.h:87
GBDATA * gb_species_data
Definition: adname.cxx:34
struct gde_database_access db_access
Definition: GDE.cxx:487
char * alignment_name
Definition: GDE_def.h:130
int AWTC_name_quality(const char *short_name)
Definition: AW_rename.cxx:707
#define GET_FIELD_CONTENT(fieldname, buffer, bufsize)
static void error(const char *msg)
Definition: mkptypes.cxx:96
#define SIZE_ID
Definition: GDE_def.h:77
size_t get_length() const
Definition: AP_filter.hxx:83
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
GB_alignment_type GBT_get_alignment_type(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:868
#define SIZE_SHORT_NAME
Definition: GDE_def.h:79
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:708
size_t get_filtered_length() const
Definition: AP_filter.hxx:82
#define SIZE_SEQ_NAME
Definition: GDE_def.h:78
void Warning(const char *s)
Definition: GDE_FileIO.cxx:334
int comments_len
Definition: GDE_def.h:93
int maxnumelements
Definition: GDE_def.h:121
static __ATTR__USERESULT int InsertDatainGDE(NA_Alignment &dataset, GBDATA **the_species, unsigned char **the_names, unsigned char **the_sequences, unsigned long numberspecies, unsigned long maxalignlen, const AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo)
GB_alignment_type
Definition: arbdb_base.h:61
static void set_constant_fields(NA_Sequence *this_elem)
fputs(TRACE_PREFIX, stderr)
int comments_maxlen
Definition: GDE_def.h:93
unsigned char NA_Base
Definition: GDE_def.h:73
int offset
Definition: GDE_def.h:97
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
TypeInfo
Definition: GDE_menu.h:42
int * tmatrix
Definition: GDE_def.h:110
int Default_PROColor_LKUP[]
Definition: GDE_global.cxx:69
GapCompression
Definition: GDE_extglob.h:13
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
size_t numelements
Definition: GDE_def.h:120
#define __ATTR__USERESULT
Definition: attributes.h:58
void ARB_realloc(TYPE *&tgt, size_t nelem)
Definition: arb_mem.h:43
int getelem(NA_Sequence *a, int b)
static void AppendNA_and_free(NA_Sequence *this_elem, uchar *&sequfilt)
GB_ERROR is_invalid() const
Definition: AP_filter.hxx:123
GBDATA * gb_species
Definition: GDE_def.h:113
int maxlen
Definition: GDE_def.h:122
void aw_message(const char *msg)
Definition: AW_status.cxx:1142
unsigned char uchar
Definition: gde.hxx:21
bool is_std_gap(const char c)
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:114
#define offset(field)
Definition: GLwDrawA.c:73
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
bool use_position(size_t pos) const
Definition: AP_filter.hxx:85
char authority[SIZE_AUTHORITY]
Definition: GDE_def.h:91
char id[SIZE_ID]
Definition: GDE_def.h:84
#define gde_assert(bed)
Definition: GDE_def.h:12
void inc_and_check_user_abort(GB_ERROR &error)
Definition: arb_progress.h:332
void InitNASeq(NA_Sequence *seq, int type)
Definition: GDE_FileIO.cxx:339
int Arbdb_get_curelem(NA_Alignment &dataset)
char * comments
Definition: GDE_def.h:92
GBDATA * GBT_get_species_data(GBDATA *gb_main)
Definition: aditem.cxx:105
#define max(a, b)
Definition: f2c.h:154