ARB
iupac.cxx
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : iupac.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // ================================================================ //
10 
11 #include "iupac.h"
12 #include <cctype>
13 
14 #define awt_assert(bed) arb_assert(bed)
15 
16 #define IUPAC_EMPTY "" // changed from " " to "" (2009-03-19 -- ralf)
17 #define ILL_CODE char(26)
18 
19 using namespace iupac;
20 
21 namespace iupac {
22 #define _____________ {NULp,0}
23 
24  const Nuc_Group nuc_group[26][2] = {
25  { { "A", 1 }, { "A", 1 } }, // A
26  { { "CGT", 3 }, { "CGU", 3 } }, // B
27  { { "C", 1 }, { "C", 1 } }, // C
28  { { "AGT", 3 }, { "AGU", 3 } }, // D
29  { _____________, _____________ }, // E
30  { _____________, _____________ }, // F
31  { { "G", 1 }, { "G", 1 } }, // G
32  { { "ACT", 3 }, { "ACU", 3 } }, // H
33  { _____________, _____________ }, // I
34  { _____________, _____________ }, // J
35  { { "GT", 2 }, { "GU", 2 } }, // K
36  { _____________, _____________ }, // L
37  { { "AC", 2 }, { "AC", 2 } }, // M
38  { { "ACGT", 1 }, { "ACGU", 1 } }, // N
39  { _____________, _____________ }, // O
40  { _____________, _____________ }, // P
41  { _____________, _____________ }, // Q
42  { { "AG", 2 }, { "AG", 2 } }, // R
43  { { "CG", 2 }, { "CG", 2 } }, // S
44  { { "T", 1 }, { "U", 1 } }, // T
45  { { "T", 1 }, { "U", 1 } }, // U
46  { { "ACG", 3 }, { "ACG", 3 } }, // V
47  { { "AT", 2 }, { "AU", 2 } }, // W
48  { _____________, _____________ }, // X
49  { { "CT", 2 }, { "CU", 2 } }, // Y
50  { _____________, _____________ }, // Z
51 
52  // (In each single string the nucs have to be sorted alphabetically)
53  };
54 
55 #undef _____________
56 
57  static const char *aminoGroupMembers[AA_GROUP_COUNT] = {
58  // Note: first character has to be the "representative character" of the group
59  "XOU", // AA_GROUP_NONE
60  "AGPST", // AA_GROUP_ALPHA
61  "DBENQZ", // AA_GROUP_BETA
62  "HKR", // AA_GROUP_GAMMA
63  "IJLMV", // AA_GROUP_DELTA
64  "FWY", // AA_GROUP_EPSILON
65  "C", // AA_GROUP_ZETA
66  };
67 
69 
70  class Setup { // setup static data in this module
71  void setup_amino_group() {
72  for (int c = 0; c<26; ++c) amino_group[c] = AA_GROUP_ILLEGAL;
73  for (Amino_Group g = AA_GROUP_NONE; g<AA_GROUP_COUNT; g = Amino_Group(g+1)) {
74  const char *members = aminoGroupMembers[g];
75  for (int p = 0; members[p]; ++p) {
76  int c = members[p];
77  awt_assert(c >= 'A' && c <= 'Z');
78  amino_group[c-'A'] = g;
79  }
80  }
81  }
82  public:
83  Setup() { setup_amino_group(); }
84  };
85  static Setup perform;
86 
87 };
88 
90  aa = toupper(aa);
91  if (aa<'A' || aa>'Z') { return AA_GROUP_ILLEGAL; }
92  return amino_group[aa-'A'];
93 }
94 
96  if (ag>AA_GROUP_NONE && ag<AA_GROUP_ILLEGAL) {
97  return aminoGroupMembers[ag][0];
98  }
99  return '?';
100 }
101 
102 static char IUPAC_add[26][26]; // uses T
103 static int IUPAC_add_initialized = 0;
104 
105 static void initialize_IUPAC_add() {
106  int c1, c2;
107 
108  for (c1=0; c1<26; c1++) {
109  const char *decoded1 = nuc_group[c1][0].members;
110 
111  if (!decoded1) {
112  for (c2=0; c2<=c1; c2++) {
113  IUPAC_add[c1][c2] = ILL_CODE;
114  IUPAC_add[c2][c1] = ILL_CODE;
115  }
116  }
117  else {
118  IUPAC_add[c1][c1] = char(c1); // add char to same char
119 
120  for (c2=0; c2<c1; c2++) {
121  if (strchr(decoded1, 'A'+c2)) { // char is already contained in this IUPAC
122  IUPAC_add[c1][c2] = char(c1);
123  }
124  else {
125  const char *decoded2 = nuc_group[c2][0].members;
126 
127  if (!decoded2) { // char is illegal
128  IUPAC_add[c1][c2] = ILL_CODE;
129  }
130  else {
131 #define MAX_MIXED 5
132  char mixed[MAX_MIXED];
133  char *mp = mixed;
134  const char *d1 = decoded1;
135  const char *d2 = decoded2;
136 
137  while (1) {
138  char z1 = *d1;
139  char z2 = *d2;
140 
141  if (!z1 && !z2) break;
142 
143  if (z1==z2) {
144  *mp++ = z1;
145  d1++;
146  d2++;
147  }
148  else if (!z2 || (z1 && z1<z2)) {
149  *mp++ = z1;
150  d1++;
151  }
152  else {
153  awt_assert(!z1 || (z2 && z2<z1));
154  *mp++ = z2;
155  d2++;
156  }
157  }
158 
159  awt_assert((mp-mixed)<MAX_MIXED);
160  *mp++ = 0;
161 
162 #if !defined(NDEBUG) && 0
163  printf("Mix '%s' + '%s' = '%s'\n", decoded1, decoded2, mixed);
164 #endif
165 
166  int c3;
167 
168  for (c3=0; c3<26; c3++) {
169  if (nuc_group[c3][0].members && strcmp(mixed, nuc_group[c3][0].members)==0) {
170  IUPAC_add[c1][c2] = char(c3);
171  break;
172  }
173  }
174 
175  if (c3>=26) {
176  IUPAC_add[c1][c2] = 0;
177  }
178  }
179  }
180 
181  if (IUPAC_add[c1][c2]==('U'-'A')) {
182  IUPAC_add[c1][c2] = 'T'-'A';
183  }
184 
185  IUPAC_add[c2][c1] = IUPAC_add[c1][c2];
186  }
187  }
188 
189  }
190 }
191 
192 char iupac::encode(const char bases[], GB_alignment_type ali) {
193  if (!IUPAC_add_initialized) {
195  IUPAC_add_initialized = 1;
196  }
197 
198  if (ali==GB_AT_AA) {
199  return '?';
200  }
201 
202  int i = 0;
203  unsigned char c1;
204 
205  while (1) {
206  c1 = bases[i++];
207  if (!c1) return '-';
208  if (isalpha(c1)) break;
209  }
210 
211  c1 = toupper(c1)-'A';
212  unsigned char c = IUPAC_add[c1][c1];
213 
214  while (c!=ILL_CODE && bases[i]) {
215  if (isalpha(bases[i])) {
216  int c2 = toupper(bases[i])-'A';
217  c = IUPAC_add[c][c2];
218  }
219  i++;
220  }
221 
222  if (c==ILL_CODE) {
223  return '-';
224  }
225 
226  c += 'A';
227  if (c=='T' && ali==GB_AT_RNA) c = 'U';
228  return c;
229 }
230 
231 char iupac::combine(char c1, char c2, GB_alignment_type ali) {
232  char buffer[3];
233  buffer[0] = c1;
234  buffer[1] = c2;
235  buffer[2] = 0;
236  return iupac::encode(buffer, ali);
237 }
238 
239 const char* iupac::decode(char iupac, GB_alignment_type ali, bool decode_amino_iupac_groups) {
240  if (!isalpha(iupac)) {
241  return IUPAC_EMPTY;
242  }
243 
244  if (ali==GB_AT_AA) {
245  if (decode_amino_iupac_groups) {
247  if (group == AA_GROUP_NONE) {
248  return "";
249  }
250  else {
251  awt_assert(group >= AA_GROUP_NONE && group<AA_GROUP_COUNT);
252  return aminoGroupMembers[group];
253  }
254  }
255  return "?";
256  }
257 
258  const char *decoded = nuc_group[toupper(iupac)-'A'][ali==GB_AT_RNA ? 1 : 0].members;
259 
260  return decoded ? decoded : IUPAC_EMPTY;
261 }
262 
263 // --------------------------------------------------------------------------------
264 
265 #ifdef UNIT_TESTS
266 
267 #include <test_unit.h>
268 
269 void TEST_amino_groups() {
270  TEST_EXPECT_EQUAL(decode('x', GB_AT_AA, true), "");
271  TEST_EXPECT_EQUAL(decode('A', GB_AT_AA, true), "AGPST");
272 
273  // each character (A-Z) shall be member of a group:
274  for (char c = 'A'; c <= 'Z'; ++c) {
279 
280  {
281  const char * const members = aminoGroupMembers[group];
282  const char * const found = strchr(members, c);
283  TEST_EXPECT_EQUAL(found ? found[0] : 0, c); // char has to be a member of its group
284  }
285 
286  if (group != AA_GROUP_NONE) {
287  const char * const decoded = decode(c, GB_AT_AA, true);
288  const char * const found = strchr(decoded, c);
289 
290  TEST_EXPECT_EQUAL(found ? found[0] : 0, c); // check char is
291  }
292  }
293 
294  // no character may be member of two groups
295  for (Amino_Group group = AA_GROUP_NONE; group < AA_GROUP_COUNT; group = Amino_Group(group+1)) {
296  const char * const member = aminoGroupMembers[group];
297  for (int pos = 0; member[pos]; ++pos) {
298  Amino_Group groupOfChar = get_amino_group_for(member[pos]);
299  TEST_EXPECT_EQUAL(groupOfChar, group);
300  }
301  }
302 }
303 
304 void TEST_nuc_groups() {
305  for (int base = 0; base<26; base++) {
306  TEST_EXPECT_EQUAL(nuc_group[base][0].count, nuc_group[base][1].count);
307  for (int alitype = 0; alitype<2; alitype++) {
308  const Nuc_Group& group = nuc_group[base][alitype];
309  if (group.members) {
310  if ((base+'A') == 'N') {
311  TEST_EXPECT_EQUAL__BROKEN(group.count, strlen(group.members), 1);
312  // @@@ fails because count is 1 for "ACGT" [N]
313  // maybe be expected by resolve_IUPAC_target_string (fix this first)
314  TEST_EXPECT_EQUAL(group.count, 1U); // fixture for behavior
315  }
316  else {
317  TEST_EXPECT_EQUAL(group.count, strlen(group.members));
318  }
319 
320  for (size_t pos = 1; pos<group.count; ++pos) {
321  TEST_EXPECT_LESS(group.members[pos-1], group.members[pos]);
322  }
323  }
324  else {
325  TEST_REJECT(group.count);
326  }
327  }
328  }
329 }
330 
331 #endif // UNIT_TESTS
static Amino_Group amino_group[26]
Definition: iupac.cxx:68
const char *const members
Definition: iupac.h:47
Definition: iupac.cxx:21
#define awt_assert(bed)
Definition: iupac.cxx:14
char get_amino_consensus_char(Amino_Group ag)
Definition: iupac.cxx:95
static char IUPAC_add[26][26]
Definition: iupac.cxx:102
#define IUPAC_EMPTY
Definition: iupac.cxx:16
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
char combine(char c1, char c2, GB_alignment_type ali)
Definition: iupac.cxx:231
const char * decode(char iupac, GB_alignment_type aliType, bool decode_amino_iupac_groups)
Definition: iupac.cxx:239
#define _____________
Definition: iupac.cxx:22
#define ILL_CODE
Definition: iupac.cxx:17
static void initialize_IUPAC_add()
Definition: iupac.cxx:105
#define TEST_EXPECT_EQUAL__BROKEN(expr, want, got)
Definition: test_unit.h:1284
static int group[MAXN+1]
Definition: ClustalV.cxx:65
Amino_Group
Definition: iupac.h:26
static int IUPAC_add_initialized
Definition: iupac.cxx:103
#define TEST_REJECT(cond)
Definition: test_unit.h:1315
const Nuc_Group nuc_group[26][2]
Definition: iupac.cxx:24
static Setup perform
Definition: iupac.cxx:85
#define TEST_EXPECT_LESS(val, ref)
Definition: test_unit.h:1293
GB_alignment_type
Definition: arbdb_base.h:61
Amino_Group get_amino_group_for(char aa)
Definition: iupac.cxx:89
size_t const count
Definition: iupac.h:48
#define TEST_EXPECT_IN_RANGE(val, lower, higher)
Definition: test_unit.h:1298
char encode(const char bases[], GB_alignment_type aliType)
Definition: iupac.cxx:192
#define MAX_MIXED
static const char * aminoGroupMembers[AA_GROUP_COUNT]
Definition: iupac.cxx:57
#define TEST_EXPECT_DIFFERENT(expr, want)
Definition: test_unit.h:1290
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
static Score ** U
Definition: align.cxx:67