ARB
AlignedSequenceLoader.cxx
Go to the documentation of this file.
1 /*
2  * AlignedSequenceLoader.cxx
3  *
4  * Created on: Feb 15, 2010
5  * Author: Breno Faria
6  *
7  * Institute of Microbiology (Technical University Munich)
8  * http://www.arb-home.de/
9  */
10 
11 
12 #include "AlignedSequenceLoader.h"
13 
14 #include <arbdb.h>
15 #include <arbdbt.h>
16 #include "dbconn.h"
17 
23 AlignedSequenceLoader::AlignedSequenceLoader() {
24 
26 
28  if (error) {
29  cout << "RNACMA-Error: " << error << "\n";
30  exit(EXIT_FAILURE);
31  }
32 
33  seqs = new VecVecType(0);
34 
35  char *al_name = GBT_get_default_alignment(gb_main);
36  MSA_len = GBT_get_alignment_len(gb_main, al_name);
37 
38  size_t occurrences[MSA_len];
39  for (size_t i = 0; i < MSA_len; i++)
40  occurrences[i] = 0;
41 
42  cout << "loading marked species: ";
43  flush( cout);
44  for (GBDATA *gb_species = GBT_first_marked_species(gb_main); gb_species; gb_species
45  = GBT_next_marked_species(gb_species)) {
46 
47  GBDATA *gb_ali = GB_entry(gb_species, al_name);
48 
49  if (gb_ali) { // existing alignment for this species
50  GBDATA *gb_data = GB_entry(gb_ali, "data");
51 
52  if (gb_data) {
53 
54  cout << GBT_get_name_or_description(gb_species) << " ";
55  flush(cout);
56 
57  string sequence = GB_read_string(gb_data);
58 
59  string *seq_as_vec = new string[MSA_len];
60  int k = 0;
61  for (string::iterator i = sequence.begin(); i != sequence.end(); ++i) {
62  switch (*i) {
63  case 'A':
64  case 'a':
65  seq_as_vec[k] = "A";
66  occurrences[k] += 1;
67  break;
68  case 'C':
69  case 'c':
70  seq_as_vec[k] = "C";
71  occurrences[k] += 1;
72  break;
73  case 'G':
74  case 'g':
75  seq_as_vec[k] = "G";
76  occurrences[k] += 1;
77  break;
78  case 'T':
79  case 't':
80  case 'U':
81  case 'u':
82  seq_as_vec[k] = "T";
83  occurrences[k] += 1;
84  break;
85  default:
86  seq_as_vec[k] = "-";
87  break;
88  }
89  k++;
90  }
91 
92  assert((size_t)k == MSA_len);
93  vector<string> seq_vector(&seq_as_vec[0], &seq_as_vec[k]);
94  delete [] seq_as_vec;
95 
96  seqs->push_back(seq_vector);
97  }
98  }
99 
100  }
101 
102  GB_pop_transaction(gb_main);
103 
104  cout << "done. Total number of species: " << seqs->size() << endl;
105  flush(cout);
106 
107  cleanSeqs(occurrences, MSA_len);
108 }
109 
115 VecVecType* AlignedSequenceLoader::getSequences() {
116  return seqs;
117 }
118 
122 AlignedSequenceLoader::~AlignedSequenceLoader() {
123  delete seqs;
124 }
125 
131 size_t AlignedSequenceLoader::getMsaLen() {
132  return MSA_len;
133 }
134 
141 vector<size_t> * AlignedSequenceLoader::getPositionMap() {
142  return position_map;
143 }
144 
153 void AlignedSequenceLoader::cleanSeqs(size_t *occurrences, long len) {
154 
155  cout << "cleaning-up sequences of empty positions... " << endl;
156  flush( cout);
157 
158  size_t num_of_bases = 0;
159  for (int i = 0; i < len; i++) {
160  if (occurrences[i] != 0) {
161  num_of_bases++;
162  }
163  }
164 
165  cout << "number of non-empty positions in MSA: " << num_of_bases
166  << ". Filtered out " << len - num_of_bases << " positions." << endl;
167 
168  VecVecType *clean_seqs = new VecVecType(0);
169 
170  cout << "computing position map..." << endl;
171  position_map = new vector<size_t> (num_of_bases, 0);
172  int j = 0;
173  for (int i = 0; i < len; i++) {
174  if (occurrences[i] != 0) {
175  position_map->at(j) = i;
176  j++;
177  }
178  }
179 
180  for (VecVecType::iterator seq = seqs->begin(); seq != seqs->end(); ++seq) {
181  //for every sequence
182  vector<string> sequence(num_of_bases, "");
183  int jj = 0;
184  for (int i = 0; i < len; ++i) {
185  if (occurrences[i] != 0) {
186  sequence.at(jj) = seq->at(i);
187  jj++;
188  }
189  }
190  assert(sequence.size() == num_of_bases);
191  clean_seqs->push_back(sequence);
192  }
193 
194  seqs = clean_seqs;
195 
196  cout << "clean-up done." << endl;
197 
198 }
199 
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:706
GB_ERROR GB_push_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2458
FILE * seq
Definition: rns.c:46
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
vector< vector< string > > VecVecType
Definition: Cma.h:60
GB_ERROR GB_pop_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2488
#define EXIT_FAILURE
Definition: arb_a2ps.c:157
GBDATA * runningDatabase()
Definition: dbconn.cxx:35
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:879
#define assert(bed)
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:675
GBDATA * gb_main
Definition: adname.cxx:33
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:437
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334