ARB
Cma.h
Go to the documentation of this file.
1 /*
2  * Cma.h
3  *
4  * CMA stands for correlated mutation analysis. This class implements
5  * information-theoretical measures to compute correlations between
6  * positions in a set of aligned sequences.
7  * The correlation values computed are Mutual Information (MI) and noiseless
8  * Mutual Information (MIp). MIp is a correlation measure proposed in the
9  * paper "Mutual information without the influence of phylogeny or entropy
10  * dramatically improves residue contact prediction" by S. D. Dunn et al.,
11  * Bioinformatics Vol. 24 no. 3, 333-340, 2008.
12  * The clustering of correlated positions follows a simple heuristic, which
13  * can be found in Thomas Runkler, "Information Mining", chapter 5, 53-58,
14  * 2000.
15  *
16  * Created on: Dec 16, 2009
17  * Author: Breno Faria
18  *
19  * Institute of Microbiology (Technical University Munich)
20  * http://www.arb-home.de/
21  */
22 
23 
24 #ifndef CMA_H
25 #define CMA_H
26 
27 #ifndef _GLIBCXX_IOSTREAM
28 #include <iostream>
29 #endif
30 #ifndef _GLIBCXX_VECTOR
31 #include <vector>
32 #endif
33 #ifndef _GLIBCXX_MAP
34 #include <map>
35 #endif
36 #ifndef _GLIBCXX_CMATH
37 #include <cmath>
38 #endif
39 #ifndef _GLIBCXX_STRING
40 #include <string>
41 #endif
42 #ifndef _GLIBCXX_LIST
43 #include <list>
44 #endif
45 
46 #include <eigen/Eigen/Eigen>
47 
48 #ifndef CXXFORWARD_H
49 #include <cxxforward.h>
50 #endif
51 
52 using namespace std; // @@@ wtf! this really is a nono
53 using Eigen::VectorXd;
54 using Eigen::VectorXi;
55 using Eigen::MatrixXd;
56 
57 // Vector of maps to store the histogram of sequences.
58 typedef vector<map<int, int> > MapVecType;
59 // Vector of vectors to store the aligned sequences (each one represented by a vector)
60 typedef vector<vector<string> > VecVecType; // @@@ string always is length 1. this should be vector<vector<char>> or vector<string>
61 // Matrix of maps to store the joint histogram of sequences.
62 typedef vector<MapVecType> MapMatrixType;
63 
64 // Struct representing the tuple (pos1, pos2, mutualInformation(pos1,pos2)).
65 struct MITuple {
66  int pos1;
67  int pos2;
68  double MI;
69 };
70 
71 class Cma FINAL_TYPE {
72 
73 private:
77  static CONSTEXPR double epsilon = 1.0e-9;
81  static CONSTEXPR double penalty = 5.;
85  vector<string> alphabet; // @@@ string always is length 1. this should be vector<char>
90  map<string, int> alphabet_map; // @@@ string is length 1 most of the time, disregarding alphabet_map["total"] which is used as global counter. this should be vector<char>
94  int num_of_seqs;
98  VectorXd entropy;
102  MatrixXd JointEntropy;
106  MatrixXd MutualInformation;
113  MatrixXd MutualInformationP;
117  VectorXi clusters;
118 
122  void initAlphabet(vector<string> alph);
126  MapMatrixType buildJointHistogram(const VecVecType & alignedSequences);
130  MatrixXd computeJointEntropy(MapMatrixType & hist);
131  MatrixXd computeJointEntropy(const VecVecType & seq);
135  VectorXd computeMeanMutualInformationVector();
136  double computeOveralMeanMutualInformation();
140  bool isValid(string & key1);
141 
142 
143 public:
144 
148  MatrixXd computeMutualInformation(VecVecType & seq);
152  MatrixXd computeMutualInformationP(VecVecType & seq);
156  VectorXi computeClusters(list<MITuple> mituples, size_t size, double threshold);
161  list<MITuple> compute_mituples(MatrixXd mutualInformation);
162 
166  MatrixXd getMI();
167  MatrixXd getMIp();
168  MatrixXd getEntropy();
169  MatrixXd getJointEntropy();
170  VectorXi getClusters();
171 
175  Cma(vector<string> & alph, int seq_number);
176  virtual ~Cma();
177 };
178 
179 #else
180 #error Cma.h included twice
181 #endif // CMA_H
int pos1
Definition: Cma.h:66
STL namespace.
FILE * seq
Definition: rns.c:46
Definition: Cma.h:65
int pos2
Definition: Cma.h:67
static double getEntropy(double **E, double m, int l)
Definition: align.cxx:150
vector< vector< string > > VecVecType
Definition: Cma.h:60
vector< map< int, int > > MapVecType
Definition: Cma.h:58
xml element
double MI
Definition: Cma.h:68
#define epsilon
Definition: DI_protdist.cxx:17
vector< MapVecType > MapMatrixType
Definition: Cma.h:62
#define CONSTEXPR
Definition: cxxforward.h:108