ARB
arb_primer.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : arb_primer.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include <arbdbt.h>
12 #include <arb_strarray.h>
13 
14 #define ADD_LEN 10
15 #define PRM_BUFFERSIZE 256
16 
17 
18 struct arb_prm_struct : virtual Noncopyable {
20  int al_len;
21  int max_name;
24  const char *source;
25  int prmanz;
26  int prmlen;
27  int prmsmin;
28  char **data;
29  int sp_count;
30  int key_cnt;
32  int reduce;
33  FILE *out;
34  char *outname;
35 
37  : al_len(0),
38  max_name(0),
39  gb_main(NULp),
40  source(NULp),
41  prmanz(0),
42  prmlen(0),
43  prmsmin(0),
44  data(NULp),
45  sp_count(0),
46  key_cnt(0),
47  one_key_cnt(0),
48  reduce(0),
49  out(NULp),
50  outname(NULp)
51  {
52  memset(buffer, 0, sizeof(buffer));
53  }
55  if (data) {
56  for (int i = 0; i<sp_count; ++i) free(data[i]);
57  free(data);
58  }
59  free(outname);
60  }
61 
62 };
64 
65 inline int getNumFromStdin() {
66  // returns entered number (or 0)
67  int i = 0;
68  if (fgets(aprm.buffer, PRM_BUFFERSIZE, stdin)) {
69  i = atoi(aprm.buffer);
70  }
71  return i;
72 }
73 
75  printf(" Please select an Alignment:\n");
76  int i;
77  for (i=1; aprm.alignment_names[i-1]; ++i) {
78  printf("%i: %s\n", i, aprm.alignment_names[i-1]);
79  }
80  aprm.max_name = i;
81 
83  i = getNumFromStdin();
84  if ((i<1) || (i>=aprm.max_name)) {
85  error = GBS_global_string("selection %i out of range", i);
86  }
87  else {
88  aprm.source = aprm.alignment_names[i-1];
89 
90  printf("This module will search for primers for all positions.\n"
91  " The best result is one primer for all (marked) taxa , the worst case\n"
92  " are n primers for n taxa.\n"
93  " Please specify the maximum number of primers:\n"
94  );
95  aprm.prmanz = getNumFromStdin();
96 
97  printf("Select minimum length of a primer, the maximum will be (minimum + %i)\n", ADD_LEN);
98  i = getNumFromStdin();
99  if ((i<4) || (i>30)) {
100  error = GBS_global_string("selection %i out of range", i);
101  }
102  else {
103  aprm.prmlen = i;
104 
105  printf("There may be short sequences or/and deletes in full sequences\n"
106  " So a primer normally does not match all sequences\n"
107  " Specify minimum percentage of species (0-100 %%):\n");
108  i = getNumFromStdin();
109  if ((i<1) || (i>100)) {
110  error = GBS_global_string("selection %i out of range", i);
111  }
112  else {
113  aprm.prmsmin = i;
114 
115  printf("Write output to file (enter \"\" to write to screen)\n");
116  if (fgets(aprm.buffer, PRM_BUFFERSIZE, stdin)) {
117  char *lf = strchr(aprm.buffer, '\n');
118  if (lf) lf[0] = 0; // remove linefeed from filename
119  aprm.outname = ARB_strdup(aprm.buffer);
120  }
121  else {
122  aprm.outname = ARB_strdup("");
123  }
124  }
125  }
126  }
127  return error;
128 }
129 
130 static GB_ERROR arb_prm_read(int /* prmanz */) {
131  GBDATA *gb_presets = GBT_get_presets(aprm.gb_main);
132  {
133  GBDATA *gb_source = GB_find_string(gb_presets, "alignment_name", aprm.source, GB_IGNORE_CASE, SEARCH_GRANDCHILD);
134  GBDATA *gb_len = GB_brother(gb_source, "alignment_len");
135  aprm.al_len = GB_read_int(gb_len);
136  }
137 
138  int sp_count = GBT_count_marked_species(aprm.gb_main);
139  ARB_calloc(aprm.data, sp_count);
140 
141  sp_count = 0;
142  for (GBDATA *gb_species = GBT_first_marked_species(aprm.gb_main);
143  gb_species;
144  gb_species = GBT_next_marked_species(gb_species))
145  {
146  GBDATA *gb_source = GB_entry(gb_species, aprm.source);
147  if (gb_source) {
148  GBDATA *gb_source_data = GB_entry(gb_source, "data");
149  if (gb_source_data) {
150  const char *hdata = GB_read_char_pntr(gb_source_data);
151 
152  if (!hdata) {
153  GB_print_error();
154  }
155  else {
156  char *data = ARB_calloc<char>(aprm.al_len+1);
157  aprm.data[sp_count ++] = data;
158 
159  if (sp_count % 50 == 0) printf("Reading taxa %i\n", sp_count);
160 
161  int size = GB_read_string_count(gb_source_data);
162  int i;
163  for (i=0; i<size; i++) { // LOOP_VECTORIZED[!<720]
164  char c = hdata[i];
165  if ((c>='a') && (c<='z')) {
166  data[i] = c-'a'+'A';
167  }
168  else {
169  data[i] = c;
170  }
171  }
172  for (; i<aprm.al_len; i++) {
173  data[i] = '.';
174  }
175  data[i] = 0;
176  }
177  }
178  }
179  }
180  printf("%i taxa read\n", sp_count);
181  aprm.sp_count = sp_count;
182  if (sp_count == 0) {
183  return "No marked taxa found";
184  }
185  return NULp;
186 }
187 
188 static void arb_count_keys(const char * /* key */, long val, void *) {
189  if (val >1) {
190  aprm.key_cnt++;
191  }
192  else {
193  aprm.one_key_cnt++;
194  }
195 }
196 
197 static void arb_print_primer(const char *key, long val, void *) {
198  if (val > 1) {
199  int gc = 0;
200  const char *p;
201  for (p = key; *p; p++) {
202  if (*p == 'G' || *p == 'C') gc++;
203  }
204  fprintf(aprm.out, " %s matching %4li taxa GC = %3i%%\n",
205  key, val, 100*gc/(int)strlen(key));
206  }
207 }
208 
209 #define is_base(c) (((c>='a') && (c<='z')) || ((c>='A')&&(c<='Z')))
210 
211 static int primer_print(char *dest, char * source, int size) {
212  char c;
213  c = *(source++);
214  if (!is_base(c)) return 1;
215  while (size) {
216  while (!is_base(c)) {
217  c = *(source++);
218  if (!c) return 1;
219  }
220  if (c == 'N' || c == 'n') return 1;
221  *(dest++) = c;
222  size--;
223  if (!c) return 1;
224  c = 0;
225  }
226  *dest = 0;
227  return 0;
228 }
229 
230 
231 static long arb_reduce_primer_len(const char *key, long val, void *cl_hash) {
232  GB_HASH* hash = (GB_HASH*)cl_hash;
233 
234  const int BUFLEN = 256;
235  char buffer[BUFLEN];
236 
237  int size = strlen(key)-aprm.reduce;
238  arb_assert(aprm.reduce>=0);
239  arb_assert(size<BUFLEN);
240 
241  memcpy(buffer, key, size);
242  buffer[size] = 0;
243 
244  val += GBS_read_hash(hash, buffer);
245  GBS_write_hash(hash, buffer, val);
246 
247  return val;
248 }
249 
250 static void arb_prm_primer(int /* prmanz */) {
251  GB_HASH *mhash;
252  int sp;
253  char *buffer;
254  int pos;
255  int prmlen;
256  int pspecies;
257  int cutoff_cnt;
258  int *best_primer_cnt;
259  int *best_primer_new;
260  int *best_primer_swap;
261 
262  prmlen = aprm.prmlen + ADD_LEN + 1;
263 
264  ARB_calloc(buffer, prmlen+1);
265  ARB_calloc(best_primer_cnt, prmlen+1);
266  ARB_calloc(best_primer_new, prmlen+1);
267 
268  for (pos = 0; pos < aprm.al_len; pos++) {
269  prmlen = aprm.prmlen + ADD_LEN;
270  mhash = GBS_create_hash(1024, GB_MIND_CASE);
271  pspecies = 0;
272  if (pos % 50 == 0) printf("Pos. %i (%i)\n", pos, aprm.al_len);
273  cutoff_cnt = aprm.prmanz+1;
274  for (sp = 0; sp < aprm.sp_count; sp++) { // build initial hash table
275  if (!primer_print(buffer, aprm.data[sp] + pos, prmlen)) {
276  GBS_incr_hash(mhash, buffer);
277  pspecies++;
278  }
279  }
280  if (pspecies*100 >= aprm.prmsmin * aprm.sp_count) { // reduce primer length
281  for (; prmlen >= aprm.prmlen; prmlen-=aprm.reduce) {
283 
284  aprm.key_cnt = 0;
285  aprm.one_key_cnt = 0;
287  if ((aprm.key_cnt + aprm.one_key_cnt < cutoff_cnt) &&
288  // (aprm.key_cnt > aprm.one_key_cnt) &&
289  (aprm.key_cnt<best_primer_cnt[prmlen+1])) {
290  fprintf(aprm.out, "%3i primer found len %3i(of %4i taxa) for position %i\n", aprm.key_cnt, prmlen, pspecies, pos);
292  fprintf(aprm.out, "\n\n");
293  cutoff_cnt = aprm.key_cnt;
294  }
295  best_primer_new[prmlen] = aprm.key_cnt;
296  aprm.reduce = 1;
297  while (aprm.key_cnt > aprm.prmanz*4) {
298  aprm.key_cnt/=4;
299  aprm.reduce++;
300  }
302  GBS_free_hash(mhash);
303  mhash = hash;
304  }
305  }
306  else {
307  for (; prmlen>0; prmlen--) best_primer_new[prmlen] = aprm.prmanz+1; // LOOP_VECTORIZED // tested down to gcc 5.5.0 (may fail on older gcc versions)
308  }
309  GBS_free_hash(mhash);
310  best_primer_swap = best_primer_new;
311  best_primer_new = best_primer_cnt;
312  best_primer_cnt = best_primer_swap;
313  mhash = NULp;
314  }
315 
316  free(best_primer_new);
317  free(best_primer_cnt);
318  free(buffer);
319 }
320 
321 int ARB_main(int argc, char *argv[]) {
322  const char *path = NULp;
323 
324  while (argc >= 2) {
325  if (strcmp(argv[1], "--help") == 0) {
326  fprintf(stderr,
327  "Usage: arb_primer [dbname]\n"
328  "Searches sequencing primers\n");
329  return EXIT_FAILURE;
330  }
331  path = argv[1];
332  argv++; argc--;
333  }
334 
335  if (!path) path = ":";
336 
337  GB_ERROR error = NULp;
338  GB_shell shell;
339  aprm.gb_main = GB_open(path, "r");
340  if (!aprm.gb_main) {
341  error = GBS_global_string("Can't open db '%s' (Reason: %s)", path, GB_await_error());
342  }
343  else {
347 
348  error = arb_prm_menu();
349 
350  if (!error) {
352  error = arb_prm_read(aprm.prmanz);
353  if (!error) {
355  if (strlen(aprm.outname)) {
356  aprm.out = fopen(aprm.outname, "w");
357  if (!aprm.out) {
358  error = GB_IO_error("writing", aprm.outname);
359  }
360  else {
361  arb_prm_primer(aprm.prmanz);
362  fclose(aprm.out);
363  }
364  }
365  else {
366  aprm.out = stdout;
367  arb_prm_primer(aprm.prmanz);
368  }
369  }
370  else {
372  }
373  }
374  GB_close(aprm.gb_main);
375  }
376 
377  if (error) {
378  fprintf(stderr, "Error in arb_primer: %s\n", error);
379  return EXIT_FAILURE;
380  }
381  return EXIT_SUCCESS;
382 }
GB_ERROR GB_begin_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2528
GBDATA * gb_main
Definition: arb_primer.cxx:22
#define arb_assert(cond)
Definition: arb_assert.h:245
static void arb_count_keys(const char *, long val, void *)
Definition: arb_primer.cxx:188
const char * GB_ERROR
Definition: arb_core.h:25
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GB_ERROR GB_commit_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2551
#define PRM_BUFFERSIZE
Definition: arb_primer.cxx:15
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:729
#define is_base(c)
Definition: arb_primer.cxx:209
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:454
long GBS_incr_hash(GB_HASH *hs, const char *key)
Definition: adhash.cxx:467
static GB_ERROR arb_prm_menu()
Definition: arb_primer.cxx:74
void GBT_get_alignment_names(ConstStrArray &names, GBDATA *gbd)
Definition: adali.cxx:317
char buffer[PRM_BUFFERSIZE]
Definition: arb_primer.cxx:23
GB_ERROR GB_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:285
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
void GBS_free_hash(GB_HASH *hs)
Definition: adhash.cxx:538
#define EXIT_SUCCESS
Definition: arb_a2ps.c:154
int getNumFromStdin()
Definition: arb_primer.cxx:65
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:916
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
static void arb_prm_primer(int)
Definition: arb_primer.cxx:250
void GBS_hash_do_const_loop(const GB_HASH *hs, gb_hash_const_loop_type func, void *client_data)
Definition: adhash.cxx:559
static void error(const char *msg)
Definition: mkptypes.cxx:96
GB_ERROR GB_abort_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2539
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
static int primer_print(char *dest, char *source, int size)
Definition: arb_primer.cxx:211
static long arb_reduce_primer_len(const char *key, long val, void *cl_hash)
Definition: arb_primer.cxx:231
GBDATA * GB_brother(GBDATA *entry, const char *key)
Definition: adquery.cxx:361
static void arb_print_primer(const char *key, long val, void *)
Definition: arb_primer.cxx:197
#define EXIT_FAILURE
Definition: arb_a2ps.c:157
void GBS_hash_do_loop(GB_HASH *hs, gb_hash_loop_type func, void *client_data)
Definition: adhash.cxx:545
long GBT_count_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:372
GB_ERROR GB_print_error()
Definition: arb_msg.cxx:324
int ARB_main(int argc, char *argv[])
Definition: arb_primer.cxx:321
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
ConstStrArray alignment_names
Definition: arb_primer.cxx:19
GBDATA * GB_find_string(GBDATA *gbd, const char *key, const char *str, GB_CASE case_sens, GB_SEARCH_TYPE gbs)
Definition: adquery.cxx:302
const char * source
Definition: arb_primer.cxx:24
#define NULp
Definition: cxxforward.h:116
static GB_ERROR arb_prm_read(int)
Definition: arb_primer.cxx:130
static arb_prm_struct aprm
Definition: arb_primer.cxx:63
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * GBT_get_presets(GBDATA *gb_main)
Definition: adali.cxx:30
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:392
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:655
#define ADD_LEN
Definition: arb_primer.cxx:14
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253