ARB
Analyser.cxx
Go to the documentation of this file.
1 /*
2  * Analyser.cpp
3  *
4  * Responsible for interacting with the Cma code and Arb.
5  *
6  * Created on: Feb 15, 2010
7  * Author: breno
8  *
9  * Institute of Microbiology (Technical University Munich)
10  * http://www.arb-home.de/
11  */
12 
13 #include "Analyser.h"
14 
15 #include <ctime>
16 #include <iostream>
17 #include <iterator>
18 
22 Analyser::Analyser(GBDATA *gb_main) :
23  loader(NULp),
24  cma(NULp)
25 {
26 
27  //Definition of the alphabet
28  vector<string> alphabet(0);
29  alphabet.push_back("A");
30  alphabet.push_back("C");
31  alphabet.push_back("G");
32  alphabet.push_back("T");
33 
34  loader = new AlignedSequenceLoader(gb_main);
35  if (!loader->has_error()) {
36  VecVecType *seqs = loader->getSequences();
37  cma = new Cma(alphabet, seqs->size());
38  }
39 }
40 
44 Analyser::~Analyser() {
45  delete loader;
46  delete cma;
47 }
48 
54 Cma* Analyser::getCma() {
55  arb_assert(!has_error());
56  return cma;
57 }
58 
64 AlignedSequenceLoader* Analyser::getLoader() {
65  return loader;
66 }
67 
76 GB_ERROR Analyser::saveSAI(GBDATA *gb_main, vector<size_t> clusters, double threshold) {
78 
79  if (!error) {
80  char *al_name = GBT_get_default_alignment(gb_main);
81 
82  vector<char> clusts = normalizeClusters(clusters);
83  // build result string
84  stringstream ss;
85  copy(clusts.begin(), clusts.end(), ostream_iterator<char> (ss, ""));
86  string result = ss.str();
87 
88  //save
89  GBDATA *gb_sai = GBT_find_or_create_SAI(gb_main, getSAIname());
90  GBDATA *gb_data = GBT_add_data(gb_sai, al_name, "data", GB_STRING);
91  error = GB_write_string(gb_data, result.c_str());
92 
93  if (!error) {
94  GBDATA *gb_options = GBT_add_data(gb_sai, al_name, "_TYPE", GB_STRING);
95  stringstream options;
96  options << "CMA_CLUSTERING: [threshold: " << threshold << "]";
97 
98  error = GB_write_string(gb_options, options.str().c_str());
99  }
100  free(al_name);
101  }
102  error = GB_end_transaction(gb_main, error);
103 
104  return error;
105 }
106 
119 vector<char> Analyser::normalizeClusters(vector<size_t> clusters) {
120 
121  vector<char> result;
122  map<unsigned int, char> rename_map;
123  rename_map[0] = '-';
124  char classes = '0';
125 
126  for (vector<size_t>::iterator it = clusters.begin(); it != clusters.end(); ++it) {
127  if (rename_map.find(*it) == rename_map.end()) {
128  rename_map[*it] = classes++;
129  }
130  result.push_back(rename_map[*it]);
131  }
132  return result;
133 
134 }
135 
136 //--------------------------------
137 
138 GB_ERROR run_rnacma(GBDATA *gb_main, bool interactive, double threshold) {
145  GB_ERROR error = NULp;
146  try {
147  Analyser a(gb_main);
148  error = a.get_error();
149 
150  if (!error) {
151  Cma *cma = a.getCma();
152 
153  cma->computeMutualInformationP(*(a.getLoader()->getSequences()));
154 
155  list<MITuple> mituple = cma->compute_mituples(cma->getMIp());
156 
157  cout << endl
158  << "The highest MI value was: " << mituple.begin()->MI
159  << " at position (" << mituple.begin()->pos1 << ", "
160  << mituple.begin()->pos2 << ")." << endl
161  << "(Note: pairs with MI-values down to the threshold will be joined in one cluster)" << endl;
162 
163  while (true) {
164  if (interactive) {
165  cout << endl
166  << "Press Ctrl-d to quit or" << endl
167  << "choose a threshold value for the clustering process: ";
168 
169 
170  string input;
171  cin >> input;
172 
173  if (input.empty()) {
174  cout << endl << "quit" << endl;
175  break;
176  }
177 
178  threshold = strtod(input.c_str(), NULp);
179  }
180  cout << "Building clusters with threshold = " << threshold << endl;
181 
182  VectorXi cl = cma->computeClusters(mituple, size_t(cma->getMIp().cols()), threshold);
183  vector<size_t> *clusters = new vector<size_t> (0, 0);
184  AlignedSequenceLoader *l = a.getLoader();
185  size_t i = 0;
186  size_t j = 0;
187  for (vector<size_t>::const_iterator it = l->getPositionMap().begin(); it != l->getPositionMap().end(); ++it) {
188  while (i < *it) {
189  clusters->push_back(0);
190  i++;
191  }
192  clusters->push_back(cl[j]);
193  j++;
194  i++;
195  }
196 
197  GB_ERROR e = a.saveSAI(gb_main, *clusters, threshold);
198  delete clusters;
199 
200  if (e) {
201  cout << "Error while saving SAI: " << e << endl;
202  throw e;
203  }
204  else {
205  cout << "Saved results to SAI '" << Analyser::getSAIname() << "'" << endl
206  << "(To analyse the results, open the editor and visualise the clusters using the SAI annotations)" << endl;
207  }
208 
209  if (!interactive) break;
210  }
211  }
212  }
213  catch (const char *err) {
214  error = GBS_global_string("caught exception: %s", err);
215  }
216  return error;
217 }
218 
219 int ARB_main(int /*argc*/, char */*argv*/[]) {
220  cout
221  << "arb_rnacma -- correlated mutation analysis" << endl
222  << " computes clusters of correlated positions" << endl
223  << "(C) 2010 The ARB-project" << endl
224  << "Written 2009/2010 by Breno Faria" << endl
225  << endl
226  << "arb_rnacma uses the eigen C++ library (http://eigen.tuxfamily.org/)" << endl
227  << "eigen is copyrighted by LGPL3" << endl
228  << endl;
229 
230  int exitcode = EXIT_SUCCESS;
231  {
232  GB_ERROR error = NULp;
233  {
234  GB_shell shell;
235  GBDATA *gb_main = GB_open(":", "rwt");
236 
237  if (!gb_main) {
238  error = GB_await_error();
239  }
240  else {
241  error = run_rnacma(gb_main, true, 0.0);
242  }
243  GB_close(gb_main);
244  }
245 
246  if (error) {
247  cout << endl << "Error in arb_rnacma: " << error << ". terminating." << endl;
248  exitcode = EXIT_FAILURE;
249  }
250  }
251 
252  return exitcode;
253 }
254 
255 // --------------------------------------------------------------------------------
256 
257 #ifdef UNIT_TESTS
258 #ifndef TEST_UNIT_H
259 #include <test_unit.h>
260 #endif
261 
262 void TEST_RNACMA() {
263  GB_shell shell;
264  {
265  const char *db = "TEST_nuc_short.arb"; // ../UNIT_TESTER/run/TEST_nuc_short.arb
266  const char *aliname = "ali_16s";
267  const char *sainame = "cma_clusters";
268 
269  GBDATA *gb_main;
270  TEST_EXPECT_RESULT__NOERROREXPORTED(gb_main = GB_open(db, "rw"));
271 
272  const int NO_OF_MARKED = 8;
273  TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);
274 
275  {
276  GB_transaction ta(gb_main);
277  TEST_EXPECT_NORESULT__NOERROREXPORTED(GBT_find_SAI(gb_main, sainame)); // expect SAI to be missing
278  }
279 
280  {
281  GB_transaction ta(gb_main);
282 
283  struct {
284  double threshold;
285  const char *expected_clusters_data;
286  } data[] = {
287  { 0.5, "--------------------------0-----10-----0---------------------23--041---000--5-55-----000060----------0600005---55575-7----5000140--3----0--0--------------2-6------0--0000--000--------0000------0-00----------0------0-00000-0-" },
288  { 0.666, "--------------------------0------1-----1----------------------2--13----456--7-88-----9::0;------------;004:8---888<7-<----8494-3---2----5--5----------------;----------16---199---------01---------------------9------:--::99---" },
289  { 0.833, "----------------------------------------------------------------------------0--1-----2-------------------------111-0---------------------------------------------------------3---------------------------------3------------2---" },
290  { 1.0, "-------------------------------------------------------------------------------0--------------------------------00-----------------------------------------------------------1---------------------------------1----------------" },
291  { 1.5, "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" },
292 
293  // output for testdata: "The highest MI value was: 1.48 at position (77, 106)."
294  { -1.0, NULp }
295  };
296 
297  TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[0].threshold));
298 
299  GBDATA *gb_sai = GBT_find_SAI(gb_main, sainame);
300  TEST_REJECT_NULL(gb_sai);
301 
302  // test sai content:
303  GBDATA *gb_sai_data = GBT_find_sequence(gb_sai, aliname);
304  TEST_REJECT_NULL(gb_sai_data);
305 
306  int idx = 0;
307  while (1) {
308  TEST_ANNOTATE(GBS_global_string("idx=%i", idx));
309  const char *sai_data = GB_read_char_pntr(gb_sai_data);
310  TEST_EXPECT_EQUAL(sai_data, data[idx].expected_clusters_data);
311 
312  ++idx;
313  if (!data[idx].expected_clusters_data) break; // continue as long there are results to test
314 
315  TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[idx].threshold));
316  }
317  }
318 
319  const int SOME_THRESHOLD = 0.5;
320 
321  // fail on invalid default alignment:
322  {
323  GB_transaction ta(gb_main);
324  TEST_EXPECT_NO_ERROR(GBT_set_default_alignment(gb_main, "ali_fantasy"));
325  TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "alignment 'ali_fantasy' not found");
326  ta.close("abort"); // undo change of default alignment
327  }
328 
329  // want failure if a marked sequence lacks data:
330  for (int fail = 1; fail<=2; ++fail) {
331  TEST_ANNOTATE(GBS_global_string("fail=%i", fail));
332 
333  GBDATA *gb_species;
334  {
335  GB_transaction ta(gb_main);
336 
337  gb_species = GBT_first_marked_species(gb_main);
338  GBDATA *gb_data = GBT_find_sequence(gb_species, aliname);
339 
340  switch (fail) {
341  case 1: {
342  GB_topSecurityLevel permit(gb_main);
344  break;
345  }
346  case 2: {
347  GBDATA *gb_ali = GB_get_father(gb_data);
349  break;
350  }
351  default: arb_assert(0); break;
352  }
353 
354  TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "species 'LcbPont2' has no data in selected alignment 'ali_16s'");
355 
356  ta.close("abort"); // undo changes
357  }
358  {
359  // somehow the species flag is reset by the above (aborted) transaction.
360  // @@@ could be a general database bug!? undo delete is known to be problematic (see #496)
361  GB_transaction ta(gb_main);
362  GB_write_flag(gb_species, 1);
363  }
364  }
365  TEST_ANNOTATE(NULp);
366  TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);
367 
368  // @@@ provoke some more errors?
369 
370  // test failure when no species marked:
371  {
372  GB_transaction ta(gb_main);
373  GBT_mark_all(gb_main, 0);
374  TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "caught exception: no (marked?) sequences");
375  ta.close("abort"); // undo changes
376  }
377 
378  // Note that marks are correctly restored by aborting TA
379  TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED);
380 
381  GB_close(gb_main);
382  }
383 }
384 
385 #endif // UNIT_TESTS
386 
387 // --------------------------------------------------------------------------------
#define arb_assert(cond)
Definition: arb_assert.h:245
const char * GB_ERROR
Definition: arb_core.h:25
GB_ERROR run_rnacma(GBDATA *gb_main, bool interactive, double threshold)
Definition: Analyser.cxx:138
string result
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1381
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2544
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:202
#define EXIT_SUCCESS
Definition: arb_a2ps.c:154
GBDATA * GB_get_father(GBDATA *gbd)
Definition: arbdb.cxx:1716
GB_ERROR GB_push_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2477
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1899
GBDATA * GBT_find_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:177
NOT4PERL GBDATA * GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type) __ATTR__DEPRECATED_TODO("better use GBT_create_sequence_data()")
Definition: adali.cxx:571
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:341
GB_ERROR GBT_set_default_alignment(GBDATA *gb_main, const char *alignment_name)
Definition: adali.cxx:692
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1310
static void error(const char *msg)
Definition: mkptypes.cxx:96
AlignedSequenceLoader(class GBDATA *gb_main)
vector< vector< string > > VecVecType
Definition: Cma.h:60
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:682
#define EXIT_FAILURE
Definition: arb_a2ps.c:157
long GBT_count_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:353
static void copy(double **i, double **j)
Definition: trnsprob.cxx:32
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:32
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2756
Cma(vector< string > &alph, int seq_number)
#define TEST_EXPECT_NO_ERROR(call)
Definition: test_unit.h:1107
int ARB_main(int, char *[])
Definition: Analyser.cxx:219
#define NULp
Definition: cxxforward.h:97
#define TEST_EXPECT_ERROR_CONTAINS(call, part)
Definition: test_unit.h:1103
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:687
void GBT_mark_all(GBDATA *gb_main, int flag)
Definition: aditem.cxx:295
GBDATA * GBT_find_or_create_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:65
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GBDATA * gb_main
Definition: adname.cxx:33
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:649