ARB
AlignedSequenceLoader.cxx
Go to the documentation of this file.
1 /*
2  * AlignedSequenceLoader.cxx
3  *
4  * Created on: Feb 15, 2010
5  * Author: Breno Faria
6  *
7  * Institute of Microbiology (Technical University Munich)
8  * http://www.arb-home.de/
9  */
10 
11 
12 #include "AlignedSequenceLoader.h"
13 
14 #include <arbdbt.h>
15 
21 AlignedSequenceLoader::AlignedSequenceLoader(GBDATA *gb_main) :
22  error(NULp),
23  seqs(NULp)
24 {
25  error = GB_push_transaction(gb_main);
26  if (!error) {
27  char *al_name = GBT_get_default_alignment(gb_main);
28  int al_len = GBT_get_alignment_len(gb_main, al_name);
29 
30  if (al_len == -1) {
32  }
33 
34  if (!error) {
35  seqs = new VecVecType(0);
36  MSA_len = al_len;
37 
38  size_t occurrences[MSA_len];
39  for (size_t i = 0; i < MSA_len; i++) occurrences[i] = 0;
40 
41  cout << "loading marked species: ";
42  flush(cout);
43  for (GBDATA *gb_species = GBT_first_marked_species(gb_main);
44  gb_species && !error;
45  gb_species = GBT_next_marked_species(gb_species))
46  {
47  GBDATA *gb_data = GBT_find_sequence(gb_species, al_name);
48  if (gb_data) { // existing alignment data for this species
49  cout << GBT_get_name_or_description(gb_species) << " ";
50  flush(cout);
51 
52  string sequence = GB_read_char_pntr(gb_data);
53 
54  string *seq_as_vec = new string[MSA_len];
55  int k = 0;
56  for (string::iterator i = sequence.begin(); i != sequence.end(); ++i) {
57  switch (*i) {
58  case 'A':
59  case 'a':
60  seq_as_vec[k] = "A";
61  occurrences[k] += 1;
62  break;
63  case 'C':
64  case 'c':
65  seq_as_vec[k] = "C";
66  occurrences[k] += 1;
67  break;
68  case 'G':
69  case 'g':
70  seq_as_vec[k] = "G";
71  occurrences[k] += 1;
72  break;
73  case 'T':
74  case 't':
75  case 'U':
76  case 'u':
77  seq_as_vec[k] = "T";
78  occurrences[k] += 1;
79  break;
80  default:
81  seq_as_vec[k] = "-";
82  break;
83  }
84  k++;
85  }
86 
87  arb_assert((size_t)k == MSA_len);
88  vector<string> seq_vector(&seq_as_vec[0], &seq_as_vec[k]);
89  delete [] seq_as_vec;
90 
91  seqs->push_back(seq_vector);
92  }
93  else {
94  error = GBS_global_string("species '%s' has no data in selected alignment '%s'",
95  GBT_get_name_or_description(gb_species),
96  al_name);
97  }
98  }
99 
100  cout << "done. Total number of species: " << seqs->size() << endl;
101  flush(cout);
102 
103  cleanSeqs(occurrences, MSA_len);
104  }
105 
106  free(al_name);
107  }
108 
109  error = GB_end_transaction(gb_main, error);
110 }
111 
117 VecVecType* AlignedSequenceLoader::getSequences() {
118  arb_assert(!has_error());
119  arb_assert(seqs);
120  return seqs;
121 }
122 
126 AlignedSequenceLoader::~AlignedSequenceLoader() {
127  delete seqs;
128 }
129 
135 size_t AlignedSequenceLoader::getMsaLen() {
136  arb_assert(!has_error());
137  return MSA_len;
138 }
139 
146 const vector<size_t>& AlignedSequenceLoader::getPositionMap() {
147  arb_assert(!has_error());
148  return position_map;
149 }
150 
159 void AlignedSequenceLoader::cleanSeqs(const size_t *occurrences, long len) {
160 
161  cout << "cleaning-up sequences of empty positions... " << endl;
162  flush( cout);
163 
164  size_t num_of_bases = 0;
165  for (int i = 0; i < len; i++) {
166  if (occurrences[i] != 0) {
167  num_of_bases++;
168  }
169  }
170 
171  cout << "number of non-empty positions in MSA: " << num_of_bases
172  << ". Filtered out " << len - num_of_bases << " positions." << endl;
173 
174  VecVecType *clean_seqs = new VecVecType(0);
175 
176  cout << "computing position map..." << endl;
177  position_map.resize(num_of_bases, 0);
178 
179  int j = 0;
180  for (int i = 0; i < len; i++) {
181  if (occurrences[i] != 0) {
182  position_map.at(j) = i;
183  j++;
184  }
185  }
186 
187  for (VecVecType::iterator seq = seqs->begin(); seq != seqs->end(); ++seq) {
188  // for every sequence
189  vector<string> sequence(num_of_bases, "");
190  int jj = 0;
191  for (int i = 0; i < len; ++i) {
192  if (occurrences[i] != 0) {
193  sequence.at(jj) = seq->at(i);
194  jj++;
195  }
196  }
197  arb_assert(sequence.size() == num_of_bases);
198  clean_seqs->push_back(sequence);
199  }
200 
201  delete seqs; // before overwriting it
202  seqs = clean_seqs;
203 
204  cout << "clean-up done." << endl;
205 
206 }
207 
#define arb_assert(cond)
Definition: arb_assert.h:245
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2549
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:706
GB_ERROR GB_push_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2482
FILE * seq
Definition: rns.c:46
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
vector< vector< string > > VecVecType
Definition: Cma.h:60
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:670
#define NULp
Definition: cxxforward.h:97
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:675
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GBDATA * gb_main
Definition: adname.cxx:33
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:441