ARB
AlignedSequenceLoader.cxx
Go to the documentation of this file.
1 /*
2  * AlignedSequenceLoader.cxx
3  *
4  * Created on: Feb 15, 2010
5  * Author: Breno Faria
6  *
7  * Institute of Microbiology (Technical University Munich)
8  * http://www.arb-home.de/
9  */
10 
11 
12 #include "AlignedSequenceLoader.h"
13 
14 #include <arbdbt.h>
15 
21 AlignedSequenceLoader::AlignedSequenceLoader(GBDATA *gb_main) :
22  error(NULp),
23  seqs(NULp)
24 {
25  error = GB_push_transaction(gb_main);
26  if (!error) {
27  char *al_name = GBT_get_default_alignment(gb_main);
28  if (!al_name) {
29  error = GB_await_error();
30  }
31  else {
32  int al_len = GBT_get_alignment_len(gb_main, al_name);
33 
34  if (al_len == -1) {
35  error = GB_await_error();
36  }
37  else {
38  seqs = new VecVecType(0);
39  MSA_len = al_len;
40 
41  size_t occurrences[MSA_len];
42  for (size_t i = 0; i < MSA_len; i++) occurrences[i] = 0;
43 
44  cout << "loading marked species: ";
45  flush(cout);
46  for (GBDATA *gb_species = GBT_first_marked_species(gb_main);
47  gb_species && !error;
48  gb_species = GBT_next_marked_species(gb_species))
49  {
50  GBDATA *gb_data = GBT_find_sequence(gb_species, al_name);
51  if (!gb_data) {
52  error = GBS_global_string("species '%s' has no data in selected alignment '%s'",
53  GBT_get_name_or_description(gb_species),
54  al_name);
55  }
56  else { // existing alignment data for this species
57  cout << GBT_get_name_or_description(gb_species) << " ";
58  flush(cout);
59 
60  string sequence = GB_read_char_pntr(gb_data);
61 
62  string *seq_as_vec = new string[MSA_len];
63  int k = 0;
64  for (string::iterator i = sequence.begin(); i != sequence.end(); ++i) {
65  switch (*i) {
66  case 'A':
67  case 'a':
68  seq_as_vec[k] = "A";
69  occurrences[k] += 1;
70  break;
71  case 'C':
72  case 'c':
73  seq_as_vec[k] = "C";
74  occurrences[k] += 1;
75  break;
76  case 'G':
77  case 'g':
78  seq_as_vec[k] = "G";
79  occurrences[k] += 1;
80  break;
81  case 'T':
82  case 't':
83  case 'U':
84  case 'u':
85  seq_as_vec[k] = "T";
86  occurrences[k] += 1;
87  break;
88  default:
89  seq_as_vec[k] = "-";
90  break;
91  }
92  k++;
93  }
94 
95  arb_assert((size_t)k == MSA_len);
96  vector<string> seq_vector(&seq_as_vec[0], &seq_as_vec[k]);
97  delete [] seq_as_vec;
98 
99  seqs->push_back(seq_vector);
100  }
101  }
102 
103  cout << "done. Total number of species: " << seqs->size() << endl;
104  flush(cout);
105 
106  cleanSeqs(occurrences, MSA_len);
107  }
108 
109  free(al_name);
110  }
111  }
112 
113  error = GB_end_transaction(gb_main, error);
114 }
115 
121 VecVecType* AlignedSequenceLoader::getSequences() {
122  arb_assert(!has_error());
123  arb_assert(seqs);
124  return seqs;
125 }
126 
130 AlignedSequenceLoader::~AlignedSequenceLoader() {
131  delete seqs;
132 }
133 
139 size_t AlignedSequenceLoader::getMsaLen() {
140  arb_assert(!has_error());
141  return MSA_len;
142 }
143 
150 const vector<size_t>& AlignedSequenceLoader::getPositionMap() {
151  arb_assert(!has_error());
152  return position_map;
153 }
154 
163 void AlignedSequenceLoader::cleanSeqs(const size_t *occurrences, long len) {
164 
165  cout << "cleaning-up sequences of empty positions... " << endl;
166  flush( cout);
167 
168  size_t num_of_bases = 0;
169  for (int i = 0; i < len; i++) {
170  if (occurrences[i] != 0) {
171  num_of_bases++;
172  }
173  }
174 
175  cout << "number of non-empty positions in MSA: " << num_of_bases
176  << ". Filtered out " << len - num_of_bases << " positions." << endl;
177 
178  VecVecType *clean_seqs = new VecVecType(0);
179 
180  cout << "computing position map..." << endl;
181  position_map.resize(num_of_bases, 0);
182 
183  int j = 0;
184  for (int i = 0; i < len; i++) {
185  if (occurrences[i] != 0) {
186  position_map.at(j) = i;
187  j++;
188  }
189  }
190 
191  for (VecVecType::iterator seq = seqs->begin(); seq != seqs->end(); ++seq) {
192  // for every sequence
193  vector<string> sequence(num_of_bases, "");
194  int jj = 0;
195  for (int i = 0; i < len; ++i) {
196  if (occurrences[i] != 0) {
197  sequence.at(jj) = seq->at(i);
198  jj++;
199  }
200  }
201  arb_assert(sequence.size() == num_of_bases);
202  clean_seqs->push_back(sequence);
203  }
204 
205  delete seqs; // before overwriting it
206  seqs = clean_seqs;
207 
208  cout << "clean-up done." << endl;
209 
210 }
211 
#define arb_assert(cond)
Definition: arb_assert.h:245
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2561
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:833
GB_ERROR GB_push_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2494
FILE * seq
Definition: rns.c:46
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
vector< vector< string > > VecVecType
Definition: Cma.h:60
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:708
#define NULp
Definition: cxxforward.h:114
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:747
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:32
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:459
void free() const