ARB
Cma.cxx
Go to the documentation of this file.
1 /*
2  * Cma.cpp
3  *
4  * Created on: Dec 16, 2009
5  * Author: Breno Faria
6  *
7  * Institute of Microbiology (Technical University Munich)
8  * http://www.arb-home.de/
9  */
10 
11 #include <iostream>
12 #include <iomanip>
13 #include <time.h>
14 
15 #include "Cma.h"
16 
17 //>--------------------------Global definitions--------------------------------
18 
19 // Some local function prototypes.
20 void unifyCluster(int cluster1, int cluster2, VectorXi & result);
21 
22 //>-----------------------local helper functions-------------------------------
23 
34 static bool compareMITuples(MITuple first, MITuple second) {
35  return first.MI > second.MI;
36 }
37 
45 void unifyCluster(int cluster1, int cluster2, VectorXi & result) {
46  size_t size = result.size();
47  for (size_t i = 0; i < size; ++i) {
48  if (result[i] == cluster2) {
49  result[i] = cluster1;
50  }
51  }
52 }
53 
54 //>---------------------------Public methods-----------------------------------
55 
65 Cma::Cma(vector<string> & alph, int seq_num) {
66 
67  initAlphabet(alph);
68  num_of_seqs = seq_num;
69 
70 }
71 
75 Cma::~Cma() {
76 }
77 
92 MatrixXd Cma::computeMutualInformation(VecVecType & seq) {
93 
94  JointEntropy = computeJointEntropy(seq);
95 
96  JointEntropy = JointEntropy + JointEntropy.adjoint() - JointEntropy.diagonal().asDiagonal();
97 
98  entropy = JointEntropy.diagonal();
99 
100  cout << "computing Mutual Information... ";
101  flush(cout);
102  clock_t start = clock();
103 
104  VectorXd Ones = VectorXd::Ones(entropy.size());
105  MatrixXd Hx = (entropy * Ones.transpose()).transpose();
106  MatrixXd Hy = Hx.transpose();
107 
108  MutualInformation = Hx + Hy - JointEntropy;
109 
110  cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
111  << "s." << endl;
112 
113  return MutualInformation;
114 
115 }
116 
139 MatrixXd Cma::computeMutualInformationP(VecVecType & seq) {
140 
141  if (MutualInformation.size() == 0) {
142  computeMutualInformation(seq);
143  }
144 
145  cout << "computing noiseless Mutual Information (MIp)... ";
146  flush(cout);
147  clock_t start = clock();
148  /*
149  * We must remove negative MI values (the negative values are artificially
150  * generated, see comments in computeJointEntropy).
151  */
152  for (int i = 0; i < MutualInformation.rows(); ++i) {
153  for (int j = 0; j < MutualInformation.cols(); ++j) {
154  if (MutualInformation(i, j) < 0.) {
155  MutualInformation(i, j) = 0.;
156  }
157  }
158  }
159 
160  VectorXd mMIs = computeMeanMutualInformationVector();
161  double mMI = computeOveralMeanMutualInformation();
162 
163  MutualInformationP = MutualInformation - ((mMIs * mMIs.transpose()) / mMI);
164 
165  cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
166  << "s." << endl;
167 
168  return MutualInformationP;
169 
170 }
171 
179 list<MITuple> Cma::compute_mituples(MatrixXd mutualInformation) {
180  size_t size = mutualInformation.cols();
181 
182  list<MITuple> mituples;
183 
184  // populate the mituples list with all entries in the upper triangular matrix
185  // of mutualInformation.
186  for (size_t i = 0; i < size; ++i) {
187  for (size_t j = i + 1; j < size; ++j) {
188  MITuple curr;
189  curr.MI = mutualInformation(i, j);
190  curr.pos1 = i;
191  curr.pos2 = j;
192  mituples.push_back(curr);
193  }
194  }
195 
196  // sort mituples by MI-value
197  mituples.sort(compareMITuples);
198 
199  return mituples;
200 }
201 
216 VectorXi Cma::computeClusters(list<MITuple> mituples, size_t size,
217  double threshold) {
218 
219  cout << "computing clusters of correlated positions... ";
220  flush(cout);
221 
222  VectorXi result = VectorXi::Zero(size);
223 
224  //compute the clusters.
225  int cluster = 1;
226  int rest = int(size);
227  for (list<MITuple>::iterator it = mituples.begin(); it != mituples.end(); ++it) {
228  double result1 = abs(result[it->pos1]);
229  double result2 = abs(result[it->pos2]);
230  if (it->MI <= threshold || rest == 0) {
231  break;
232  }
233  if (result1 < epsilon and result2 < epsilon) {
234  result[it->pos1] = cluster;
235  result[it->pos2] = cluster++;
236  rest -= 2;
237  }
238  else if (result1 > epsilon and result2 < epsilon) {
239  result[it->pos2] = result[it->pos1];
240  rest -= 1;
241  }
242  else if (result1 < epsilon and result2 > epsilon) {
243  result[it->pos1] = result[it->pos2];
244  rest -= 1;
245  }
246  else if (result1 > epsilon and result2 > epsilon && result1 - result2 > epsilon) {
247  unifyCluster(result[it->pos1], result[it->pos2], result);
248  }
249  }
250 
251  clusters = result;
252 
253  cout << "done." << endl;
254 
255  return result;
256 }
257 
258 //>-------------------------getters and setters--------------------------------
259 
260 MatrixXd Cma::getEntropy() {
261 
262  return entropy;
263 
264 }
265 
266 MatrixXd Cma::getJointEntropy() {
267 
268  return JointEntropy;
269 
270 }
271 
272 MatrixXd Cma::getMI() {
273 
274  return MutualInformation;
275 
276 }
277 
278 MatrixXd Cma::getMIp() {
279 
280  return MutualInformationP;
281 
282 }
283 
284 VectorXi Cma::getClusters() {
285 
286  return clusters;
287 
288 }
289 
290 //>----------------------------private methods---------------------------------
291 
295 void Cma::initAlphabet(vector<string> alph) {
296 
297  alphabet = vector<string> (alph);
298 
299  int i = 0;
300  for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
301  alphabet_map[*it] = i;
302  i++;
303  }
304  for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
305  for (vector<string>::iterator it2 = alph.begin(); it2 != alph.end(); ++it2) {
306  alphabet_map[*it + *it2] = i;
307  i++;
308  }
309  }
310  alphabet_map["total"] = i;
311 }
312 
330 MatrixXd Cma::computeJointEntropy(const VecVecType & seqs) {
331 
332  MapMatrixType hist = buildJointHistogram(seqs);
333  return Cma::computeJointEntropy(hist);
334 
335 }
336 
347 MatrixXd Cma::computeJointEntropy(MapMatrixType & hist) {
348 
349  cout << "computing joint entropy (this may take a while)... ";
350  flush(cout);
351  clock_t start = clock();
352 
353  int hist_size = int(hist.size());
354  MatrixXd result(hist_size, hist_size);
355  result.setZero(hist_size, hist_size);
356  for (unsigned int i = 0; i < (unsigned int) hist_size; ++i) {
357 
358  //progress "bar"
359  if (hist_size > 30 and i % (hist_size / 30) == 0) {
360  cout << "(" << setw(6) << setiosflags(ios::fixed)
361  << setprecision(2) << i / float(hist_size) * 100 << "%)";
362  flush(cout);
363  }
364 
365  for (unsigned int j = i; j < hist[i].size(); ++j) {
366  int total_i_j = hist[i][j][alphabet_map.at("total")];
367  int total_i_i = hist[i][i][alphabet_map.at("total")];
368  int total_j_j = hist[j][j][alphabet_map.at("total")];
369  int mismatches = total_i_i + total_j_j - 2 * total_i_j;
370  if (total_i_j != 0) {
371  double result_i_j = 0.;
372  for (map<int, int>::iterator it = hist[i][j].begin(); it
373  != hist[i][j].end(); ++it) {
374  double pair = double(it->second) + Cma::epsilon;
375  result_i_j += (pair / total_i_j) * log2(pair / total_i_j);
376  }
377 
378  result(i, j) = -result_i_j + double(mismatches) / (total_i_i
379  + total_j_j) * Cma::penalty;
380 
381  } else {
390  result(i, j) = 5.;
391  }
392 
393  }
394  //progress "bar"
395  if (hist_size > 30 and i % (hist_size / 30) == 0) {
396  cout << "\b\b\b\b\b\b\b\b\b";
397  }
398  }
399  flush(cout);
400  cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
401  << "s." << endl;
402  return result;
403 
404 }
405 
413 bool Cma::isValid(string & key) {
414 
415  map<string, int>::iterator it = alphabet_map.find(key);
416  return it != alphabet_map.end();
417 
418 }
419 
430 MapMatrixType Cma::buildJointHistogram(const VecVecType & alignedSequences) {
431 
432  cout << "building histogram (this may take a while)... ";
433  flush(cout);
434  clock_t start = clock();
435 
436  if (alignedSequences.empty()) {
437  throw "no (marked?) sequences";
438  }
439 
440  // here we defined one dimension of the matrix as being the same as the
441  // length of the sequences.
442  MapMatrixType result(alignedSequences[0].size(), MapVecType(
443  alignedSequences[0].size()));
444 
445  int ii = 0;
446  // for each sequence:
447  for (VecVecType::const_iterator it = alignedSequences.begin(); it
448  != alignedSequences.end(); ++it) {
449 
450  cout << "(" << setw(6) << setiosflags(ios::fixed) << setprecision(2)
451  << ii / float(num_of_seqs) * 100 << "%)";
452  flush(cout);
453 
454  // for each first position in the sequence
455  for (size_t i = 0; i < it->size(); ++i) {
456  string key1 = it->at(i);
457  if (isValid(key1)) {
458 
459  //for each second position in the sequence
460  for (size_t j = i; j < it->size(); ++j) {
461  string key2 = it->at(j);
462  if (isValid(key2)) {
463  result[i][j][alphabet_map.at(key1 + key2)] += 1;
464  result[i][j][alphabet_map.at("total")] += 1;
465  }
466  }
467  }
468  }
469  cout << "\b\b\b\b\b\b\b\b\b";
470  ii++;
471  }
472  flush(cout);
473  cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
474  << "s." << endl;
475 
476  return result;
477 }
478 
487 VectorXd Cma::computeMeanMutualInformationVector() {
488 
489  return MutualInformation.rowwise().sum() / num_of_seqs;
490 
491 }
492 
501 double Cma::computeOveralMeanMutualInformation() {
502 
503  return 2. * MutualInformation.sum() / num_of_seqs / num_of_seqs;
504 
505 }
506 
string result
static bool compareMITuples(MITuple first, MITuple second)
Definition: Cma.cxx:34
int pos1
Definition: Cma.h:66
FILE * seq
Definition: rns.c:46
static HelixNrInfo * start
Definition: Cma.h:65
int pos2
Definition: Cma.h:67
static double getEntropy(double **E, double m, int l)
Definition: align.cxx:150
vector< vector< string > > VecVecType
Definition: Cma.h:60
vector< map< int, int > > MapVecType
Definition: Cma.h:58
double MI
Definition: Cma.h:68
#define epsilon
Definition: DI_protdist.cxx:17
void unifyCluster(int cluster1, int cluster2, VectorXi &result)
Definition: Cma.cxx:45
#define abs(x)
Definition: f2c.h:151
vector< MapVecType > MapMatrixType
Definition: Cma.h:62