ARB
Analyser.cxx
Go to the documentation of this file.
1 /*
2  * Analyser.cpp
3  *
4  * Responsible for interacting with the Cma code and Arb.
5  *
6  * Created on: Feb 15, 2010
7  * Author: breno
8  *
9  * Institute of Microbiology (Technical University Munich)
10  * http://www.arb-home.de/
11  */
12 
13 #include "Analyser.h"
14 
15 #include <ctime>
16 #include <iostream>
17 #include <iterator>
18 
22 Analyser::Analyser(GBDATA *gb_main) :
23  loader(NULp),
24  cma(NULp)
25 {
26 
27  //Definition of the alphabet
28  vector<string> alphabet(0);
29  alphabet.push_back("A");
30  alphabet.push_back("C");
31  alphabet.push_back("G");
32  alphabet.push_back("T");
33 
34  loader = new AlignedSequenceLoader(gb_main);
35  if (!loader->has_error()) {
36  VecVecType *seqs = loader->getSequences();
37  cma = new Cma(alphabet, seqs->size());
38  }
39 }
40 
44 Analyser::~Analyser() {
45  delete loader;
46  delete cma;
47 }
48 
54 Cma* Analyser::getCma() {
55  arb_assert(!has_error());
56  return cma;
57 }
58 
64 AlignedSequenceLoader* Analyser::getLoader() {
65  return loader;
66 }
67 
76 GB_ERROR Analyser::saveSAI(GBDATA *gb_main, vector<size_t> clusters, double threshold) {
78 
79  if (!error) {
80  char *al_name = GBT_get_default_alignment(gb_main);
81  if (!al_name) {
82  error = GB_await_error();
83  }
84  else {
85  vector<char> clusts = normalizeClusters(clusters);
86  // build result string
87  stringstream ss;
88  copy(clusts.begin(), clusts.end(), ostream_iterator<char> (ss, ""));
89  string result = ss.str();
90 
91  //save
92  GBDATA *gb_sai = GBT_find_or_create_SAI(gb_main, getSAIname());
93  GBDATA *gb_data = GBT_add_data(gb_sai, al_name, "data", GB_STRING);
94  error = GB_write_string(gb_data, result.c_str());
95 
96  if (!error) {
97  GBDATA *gb_options = GBT_add_data(gb_sai, al_name, "_TYPE", GB_STRING);
98  stringstream options;
99  options << "CMA_CLUSTERING: [threshold: " << threshold << "]";
100 
101  error = GB_write_string(gb_options, options.str().c_str());
102  }
103  free(al_name);
104  }
105  }
106  error = GB_end_transaction(gb_main, error);
107 
108  return error;
109 }
110 
123 vector<char> Analyser::normalizeClusters(vector<size_t> clusters) {
124 
125  vector<char> result;
126  map<unsigned int, char> rename_map;
127  rename_map[0] = '-';
128  char classes = '0';
129 
130  for (vector<size_t>::iterator it = clusters.begin(); it != clusters.end(); ++it) {
131  if (rename_map.find(*it) == rename_map.end()) {
132  rename_map[*it] = classes++;
133  }
134  result.push_back(rename_map[*it]);
135  }
136  return result;
137 
138 }
139 
140 //--------------------------------
141 
142 GB_ERROR run_rnacma(GBDATA *gb_main, bool interactive, double threshold) {
149  GB_ERROR error = NULp;
150  try {
151  Analyser a(gb_main);
152  error = a.get_error();
153 
154  if (!error) {
155  Cma *cma = a.getCma();
156 
157  cma->computeMutualInformationP(*(a.getLoader()->getSequences()));
158 
159  list<MITuple> mituple = cma->compute_mituples(cma->getMIp());
160 
161  cout << endl
162  << "The highest MI value was: " << mituple.begin()->MI
163  << " at position (" << mituple.begin()->pos1 << ", "
164  << mituple.begin()->pos2 << ")." << endl
165  << "(Note: pairs with MI-values down to the threshold will be joined in one cluster)" << endl;
166 
167  while (true) {
168  if (interactive) {
169  cout << endl
170  << "Press Ctrl-d to quit or" << endl
171  << "choose a threshold value for the clustering process: ";
172 
173 
174  string input;
175  cin >> input;
176 
177  if (input.empty()) {
178  cout << endl << "quit" << endl;
179  break;
180  }
181 
182  threshold = strtod(input.c_str(), NULp);
183  }
184  cout << "Building clusters with threshold = " << threshold << endl;
185 
186  VectorXi cl = cma->computeClusters(mituple, size_t(cma->getMIp().cols()), threshold);
187  vector<size_t> *clusters = new vector<size_t> (0, 0);
188  AlignedSequenceLoader *l = a.getLoader();
189  size_t i = 0;
190  size_t j = 0;
191  for (vector<size_t>::const_iterator it = l->getPositionMap().begin(); it != l->getPositionMap().end(); ++it) {
192  while (i < *it) {
193  clusters->push_back(0);
194  i++;
195  }
196  clusters->push_back(cl[j]);
197  j++;
198  i++;
199  }
200 
201  GB_ERROR e = a.saveSAI(gb_main, *clusters, threshold);
202  delete clusters;
203 
204  if (e) {
205  cout << "Error while saving SAI: " << e << endl;
206  throw e;
207  }
208  else {
209  cout << "Saved results to SAI '" << Analyser::getSAIname() << "'" << endl
210  << "(To analyse the results, open the editor and visualise the clusters using the SAI annotations)" << endl;
211  }
212 
213  if (!interactive) break;
214  }
215  }
216  }
217  catch (const char *err) {
218  error = GBS_global_string("caught exception: %s", err);
219  }
220  return error;
221 }
222 
223 int ARB_main(int /*argc*/, char */*argv*/[]) {
224  cout
225  << "arb_rnacma -- correlated mutation analysis" << endl
226  << " computes clusters of correlated positions" << endl
227  << "(C) 2010 The ARB-project" << endl
228  << "Written 2009/2010 by Breno Faria" << endl
229  << endl
230  << "arb_rnacma uses the eigen C++ library (http://eigen.tuxfamily.org/)" << endl
231  << "eigen is copyrighted by LGPL3" << endl
232  << endl;
233 
234  int exitcode = EXIT_SUCCESS;
235  {
236  GB_ERROR error = NULp;
237  {
238  GB_shell shell;
239  GBDATA *gb_main = GB_open(":", "rwt");
240 
241  if (!gb_main) {
242  error = GB_await_error();
243  }
244  else {
245  error = run_rnacma(gb_main, true, 0.0);
246  }
247  GB_close(gb_main);
248  }
249 
250  if (error) {
251  cout << endl << "Error in arb_rnacma: " << error << ". terminating." << endl;
252  exitcode = EXIT_FAILURE;
253  }
254  }
255 
256  return exitcode;
257 }
258 
259 // --------------------------------------------------------------------------------
260 
261 #ifdef UNIT_TESTS
262 #ifndef TEST_UNIT_H
263 #include <test_unit.h>
264 #endif
265 
266 void TEST_RNACMA() {
267  GB_shell shell;
268  {
269  const char *db = "TEST_nuc_short.arb"; // ../UNIT_TESTER/run/TEST_nuc_short.arb
270  const char *aliname = "ali_16s";
271  const char *sainame = "cma_clusters";
272 
273  GBDATA *gb_main;
274  TEST_EXPECT_RESULT__NOERROREXPORTED(gb_main = GB_open(db, "rw"));
275 
276  const int NO_OF_MARKED = 8;
277  TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);
278 
279  {
280  GB_transaction ta(gb_main);
281  TEST_EXPECT_NORESULT__NOERROREXPORTED(GBT_find_SAI(gb_main, sainame)); // expect SAI to be missing
282  }
283 
284  {
285  GB_transaction ta(gb_main);
286 
287  struct {
288  double threshold;
289  const char *expected_clusters_data;
290  } data[] = {
291  { 0.5, "--------------------------0-----10-----0---------------------23--041---000--5-55-----000060----------0600005---55575-7----5000140--3----0--0--------------2-6------0--0000--000--------0000------0-00----------0------0-00000-0-" },
292  { 0.666, "--------------------------0------1-----1----------------------2--13----456--7-88-----9::0;------------;004:8---888<7-<----8494-3---2----5--5----------------;----------16---199---------01---------------------9------:--::99---" },
293  { 0.833, "----------------------------------------------------------------------------0--1-----2-------------------------111-0---------------------------------------------------------3---------------------------------3------------2---" },
294  { 1.0, "-------------------------------------------------------------------------------0--------------------------------00-----------------------------------------------------------1---------------------------------1----------------" },
295  { 1.5, "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" },
296 
297  // output for testdata: "The highest MI value was: 1.48 at position (77, 106)."
298  { -1.0, NULp }
299  };
300 
301  TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[0].threshold));
302 
303  GBDATA *gb_sai = GBT_find_SAI(gb_main, sainame);
304  TEST_REJECT_NULL(gb_sai);
305 
306  // test sai content:
307  GBDATA *gb_sai_data = GBT_find_sequence(gb_sai, aliname);
308  TEST_REJECT_NULL(gb_sai_data);
309 
310  int idx = 0;
311  while (1) {
312  TEST_ANNOTATE(GBS_global_string("idx=%i", idx));
313  const char *sai_data = GB_read_char_pntr(gb_sai_data);
314  TEST_EXPECT_EQUAL(sai_data, data[idx].expected_clusters_data);
315 
316  ++idx;
317  if (!data[idx].expected_clusters_data) break; // continue as long there are results to test
318 
319  TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[idx].threshold));
320  }
321  }
322 
323  const int SOME_THRESHOLD = 0.5;
324 
325  // fail on invalid default alignment:
326  {
327  GB_transaction ta(gb_main);
328  TEST_EXPECT_NO_ERROR(GBT_set_default_alignment(gb_main, "ali_fantasy"));
329  TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "alignment 'ali_fantasy' not found");
330  ta.close("abort"); // undo change of default alignment
331  }
332 
333  // want failure if a marked sequence lacks data:
334  for (int fail = 1; fail<=2; ++fail) {
335  TEST_ANNOTATE(GBS_global_string("fail=%i", fail));
336 
337  GBDATA *gb_species;
338  {
339  GB_transaction ta(gb_main);
340 
341  gb_species = GBT_first_marked_species(gb_main);
342  GBDATA *gb_data = GBT_find_sequence(gb_species, aliname);
343 
344  switch (fail) {
345  case 1: {
346  GB_topSecurityLevel permit(gb_main);
348  break;
349  }
350  case 2: {
351  GBDATA *gb_ali = GB_get_father(gb_data);
353  break;
354  }
355  default: arb_assert(0); break;
356  }
357 
358  TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "species 'LcbPont2' has no data in selected alignment 'ali_16s'");
359 
360  ta.close("abort"); // undo changes
361  }
362  {
363  // somehow the species flag is reset by the above (aborted) transaction.
364  // @@@ could be a general database bug!? undo delete is known to be problematic (see #496)
365  GB_transaction ta(gb_main);
366  GB_write_flag(gb_species, 1);
367  }
368  }
369  TEST_ANNOTATE(NULp);
370  TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);
371 
372  // @@@ provoke some more errors?
373 
374  // test failure when no species marked:
375  {
376  GB_transaction ta(gb_main);
377  GBT_mark_all(gb_main, 0);
378  TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "caught exception: no (marked?) sequences");
379  ta.close("abort"); // undo changes
380  }
381 
382  // Note that marks are correctly restored by aborting TA
383  TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);
384 
385  GB_close(gb_main);
386  }
387 }
388 
389 #endif // UNIT_TESTS
390 
391 // --------------------------------------------------------------------------------
#define arb_assert(cond)
Definition: arb_assert.h:245
const char * GB_ERROR
Definition: arb_core.h:25
GB_ERROR run_rnacma(GBDATA *gb_main, bool interactive, double threshold)
Definition: Analyser.cxx:142
string result
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1387
long GBT_mark_all(GBDATA *gb_main, int flag)
Definition: aditem.cxx:295
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2561
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
#define EXIT_SUCCESS
Definition: arb_a2ps.c:154
GBDATA * GB_get_father(GBDATA *gbd)
Definition: arbdb.cxx:1722
GB_ERROR GB_push_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2494
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
GBDATA * GBT_find_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:177
NOT4PERL GBDATA * GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type) __ATTR__DEPRECATED_TODO("better use GBT_create_sequence_data()")
Definition: adali.cxx:597
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
GB_ERROR GBT_set_default_alignment(GBDATA *gb_main, const char *alignment_name)
Definition: adali.cxx:765
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1325
static void error(const char *msg)
Definition: mkptypes.cxx:96
AlignedSequenceLoader(class GBDATA *gb_main)
vector< vector< string > > VecVecType
Definition: Cma.h:60
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:708
#define EXIT_FAILURE
Definition: arb_a2ps.c:157
long GBT_count_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:372
static void copy(double **i, double **j)
Definition: trnsprob.cxx:32
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:35
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2773
Cma(vector< string > &alph, int seq_number)
#define TEST_EXPECT_NO_ERROR(call)
Definition: test_unit.h:1118
int ARB_main(int, char *[])
Definition: Analyser.cxx:223
#define NULp
Definition: cxxforward.h:116
#define TEST_EXPECT_ERROR_CONTAINS(call, part)
Definition: test_unit.h:1114
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:747
GBDATA * GBT_find_or_create_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:65
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:32
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1294
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:655