ARB
PT_new_design.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : PT_new_design.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 
12 #include "pt_split.h"
13 #include <PT_server_prototypes.h>
14 
15 #include <struct_man.h>
16 #include "probe_tree.h"
17 #include "PT_prefixIter.h"
18 
19 #include <arb_str.h>
20 #include <arb_defs.h>
21 #include <arb_sort.h>
22 
23 #include <arb_strbuf.h>
24 #include <arb_strarray.h>
25 
26 #include <climits>
27 #include <algorithm>
28 #include <map>
29 #include <arb_progress.h>
30 
31 #if defined(DEBUG)
32 // # define DUMP_DESIGNED_PROBES
33 // # define DUMP_OLIGO_MATCHING
34 #endif
35 
36 #define MIN_DESIGN_PROBE_LENGTH DOMAIN_MIN_LENGTH
37 int Complement::calc_complement(int base) {
38  switch (base) {
39  case PT_A: return PT_T;
40  case PT_C: return PT_G;
41  case PT_G: return PT_C;
42  case PT_T: return PT_A;
43  default: return base;
44  }
45 }
46 
47 // overloaded functions to avoid problems with type-punning:
48 inline void aisc_link(dll_public *dll, PT_tprobes *tprobe) { aisc_link(reinterpret_cast<dllpublic_ext*>(dll), reinterpret_cast<dllheader_ext*>(tprobe)); }
49 
50 int pt_init_bond_matrix(PT_local *THIS) {
51  THIS->bond[0].val = 0.0;
52  THIS->bond[1].val = 0.0;
53  THIS->bond[2].val = 0.5;
54  THIS->bond[3].val = 1.1;
55  THIS->bond[4].val = 0.0;
56  THIS->bond[5].val = 0.0;
57  THIS->bond[6].val = 1.5;
58  THIS->bond[7].val = 0.0;
59  THIS->bond[8].val = 0.5;
60  THIS->bond[9].val = 1.5;
61  THIS->bond[10].val = 0.4;
62  THIS->bond[11].val = 0.9;
63  THIS->bond[12].val = 1.1;
64  THIS->bond[13].val = 0.0;
65  THIS->bond[14].val = 0.9;
66  THIS->bond[15].val = 0.0;
67 
68  return 0;
69 }
70 
71 struct dt_bondssum {
72  double dt; // sum of mismatches
73  double sum_bonds; // sum of bonds of longest non mismatch string
74 
75  dt_bondssum(double dt_, double sum_bonds_) : dt(dt_), sum_bonds(sum_bonds_) {}
76 };
77 
78 class Oligo {
79  const char *data;
80  int length;
81 
82 public:
83  Oligo() : data(NULp), length(0) {} // empty oligo
84  Oligo(const char *data_, int length_) : data(data_), length(length_) {}
85 
86  char at(int offset) const {
87  pt_assert(offset >= 0 && offset<length);
88  return data[offset];
89  }
90  int size() const { return length; }
91 
92  Oligo suffix(int skip) const {
93  return skip <= length ? Oligo(data+skip, length-skip) : Oligo();
94  }
95 
96 #if defined(DUMP_OLIGO_MATCHING)
97  void dump(FILE *out) const {
98  char *readable = readable_probe(data, length, 'T');
99  fputs(readable, out);
100  free(readable);
101  }
102 #endif
103 };
104 
106  Oligo oligo;
107  int matched; // how many positions matched
108  int lastSplit;
109 
110  bool domain_found;
111  double maxDomainBondSum;
112 
113  dt_bondssum linkage;
114 
115  MatchingOligo(const MatchingOligo& other, double strength) // expand by 1 (with a split)
116  : oligo(other.oligo),
117  matched(other.matched+1),
118  domain_found(other.domain_found),
119  maxDomainBondSum(other.maxDomainBondSum),
120  linkage(other.linkage)
121  {
122  pt_assert(other.dangling());
123  pt_assert(strength<0.0);
124 
125  lastSplit = matched-1;
126  linkage.dt -= strength;
127  linkage.sum_bonds = 0.0;
128 
129  pt_assert(domainLength() == 0);
130  }
131 
132  MatchingOligo(const MatchingOligo& other, double strength, double max_bond) // expand by 1 (binding)
133  : oligo(other.oligo),
134  matched(other.matched+1),
135  domain_found(other.domain_found),
136  maxDomainBondSum(other.maxDomainBondSum),
137  linkage(other.linkage)
138  {
139  pt_assert(other.dangling());
140  pt_assert(strength >= 0.0);
141 
142  lastSplit = other.lastSplit;
143  linkage.dt += strength;
144  linkage.sum_bonds += max_bond-strength;
145 
146  pt_assert(linkage.sum_bonds >= 0.0);
147 
148  if (domainLength() >= DOMAIN_MIN_LENGTH) {
149  domain_found = true;
150  maxDomainBondSum = std::max(maxDomainBondSum, linkage.sum_bonds);
151  }
152  }
153 
154  void accept_rest_dangling() {
155  pt_assert(dangling());
156  lastSplit = matched+1;
157  matched = oligo.size();
158  pt_assert(!dangling());
159  }
160 
161  void optimal_bind_rest(const Splits& splits) { // @@@ slow -> optimize
162  pt_assert(dangling());
163  while (dangling()) {
164  char pc = dangling_char();
165  double strength = splits.check(pc, pc); // @@@ always 0.0?
166  UNCOVERED();
167  pt_assert(strength >= 0.0);
168 
169  double bondmax = splits.get_max_bond(pc);
170 
171  matched++;
172  linkage.dt += strength;
173  linkage.sum_bonds += bondmax-strength;
174  }
175  if (domainLength() >= DOMAIN_MIN_LENGTH) {
176  domain_found = true;
177  maxDomainBondSum = std::max(maxDomainBondSum, linkage.sum_bonds);
178  }
179  pt_assert(!dangling());
180  }
181 
182 public:
183  explicit MatchingOligo(const Oligo& oligo_)
184  : oligo(oligo_),
185  matched(0),
186  lastSplit(-1),
187  domain_found(false),
188  maxDomainBondSum(-1),
189  linkage(0.0, 0.0)
190  {}
191 
192  int dangling() const {
193  int dangle_count = oligo.size()-matched;
194  pt_assert(dangle_count >= 0);
195  return dangle_count;
196  }
197 
198  char dangling_char() const {
199  pt_assert(dangling()>0);
200  return oligo.at(matched);
201  }
202 
203  MatchingOligo bind_against(char c, const Splits& splits) const {
205 
206  char pc = dangling_char();
207  double strength;
208 
209  if (c == PT_N) {
210  // we are checking outgroup-matches here => assume strongest possible bind versus N
211  strength = -100.0;
212  for (int base = PT_A; base < PT_BASES; ++base) {
213  strength = std::max(strength, splits.check(pc, base));
214  }
215  pt_assert(strength>-100.0);
216  }
217  else {
218  strength = splits.check(pc, c);
219  }
220 
221  return strength<0.0
222  ? MatchingOligo(*this, strength)
223  : MatchingOligo(*this, strength, splits.get_max_bond(pc));
224  }
225 
227  MatchingOligo accepted(*this);
228  accepted.accept_rest_dangling();
229  return accepted;
230  }
231 
232  int domainLength() const { return matched-(lastSplit+1); }
233 
234  bool domainSeen() const { return domain_found; }
235  bool domainPossible() const {
236  pt_assert(!domainSeen()); // you are testing w/o need
237  return (domainLength()+dangling()) >= DOMAIN_MIN_LENGTH;
238  }
239 
240  bool completely_bound() const { return !dangling(); }
241 
242  int calc_centigrade_pos(const PT_tprobes *tprobe, const PT_pdc *const pdc) const {
245 
246  double ndt = (linkage.dt * pdc->dt + (tprobe->sum_bonds - maxDomainBondSum)*pdc->dte) / tprobe->seq_len;
247  int pos = (int)ndt;
248 
249  pt_assert(pos >= 0);
250  return pos;
251  }
252 
253  bool centigrade_pos_out_of_reach(const PT_tprobes *tprobe, const PT_pdc *const pdc, const Splits& splits) const {
254  MatchingOligo optimum(*this);
255  optimum.optimal_bind_rest(splits);
256 
257  if (!optimum.domainSeen()) return true; // no domain -> no centigrade position
258 
259  int centpos = optimum.calc_centigrade_pos(tprobe, pdc);
260  return centpos >= PERC_SIZE;
261  }
262 
263  const Oligo& get_oligo() const { return oligo; }
264 
265 #if defined(DUMP_OLIGO_MATCHING)
266  void dump(FILE *out) const {
267  fputs("oligo='", out);
268  oligo.dump(out);
269 
270  const char *domainState = "impossible";
271  if (domainSeen()) domainState = "seen";
272  else if (domainPossible()) domainState = "possible";
273 
274  fprintf(out, "' matched=%2i lastSplit=%2i domainLength()=%2i dangling()=%2i domainState=%s maxDomainBondSum=%f dt=%f sum_bonds=%f\n",
275  matched, lastSplit, domainLength(), dangling(), domainState, maxDomainBondSum, linkage.dt, linkage.sum_bonds);
276  }
277 #endif
278 };
279 
280 static int ptnd_compare_quality(const void *vtp1, const void *vtp2, void *) {
281  const PT_tprobes *tp1 = (const PT_tprobes*)vtp1;
282  const PT_tprobes *tp2 = (const PT_tprobes*)vtp2;
283 
284  int cmp = double_cmp(tp2->quality, tp1->quality); // high quality first
285  if (!cmp) {
286  cmp = tp1->apos - tp2->apos; // low abs.pos first
287  if (!cmp) cmp = strcmp(tp1->sequence, tp2->sequence); // alphabetically by probe
288  }
289  return cmp;
290 }
291 
292 static int ptnd_compare_sequence(const void *vtp1, const void *vtp2, void*) {
293  const PT_tprobes *tp1 = (const PT_tprobes*)vtp1;
294  const PT_tprobes *tp2 = (const PT_tprobes*)vtp2;
295 
296  return strcmp(tp1->sequence, tp2->sequence);
297 }
298 
302 };
303 
304 static void sort_tprobes_by(PT_pdc *pdc, ProbeSortMode mode) {
305  if (pdc->tprobes) {
306  int list_len = pdc->tprobes->get_count();
307  if (list_len > 1) {
308  PT_tprobes **my_list = ARB_calloc<PT_tprobes*>(list_len);
309  {
310  PT_tprobes *tprobe;
311  int i;
312 
313  for (i = 0, tprobe = pdc->tprobes; tprobe; i++, tprobe = tprobe->next) {
314  my_list[i] = tprobe;
315  }
316  }
317 
318  switch (mode) {
319  case PSM_QUALITY:
320  GB_sort((void **)my_list, 0, list_len, ptnd_compare_quality, NULp);
321  break;
322 
323  case PSM_SEQUENCE:
324  GB_sort((void **)my_list, 0, list_len, ptnd_compare_sequence, NULp);
325  break;
326  }
327 
328  for (int i=0; i<list_len; i++) {
329  aisc_unlink(reinterpret_cast<dllheader_ext*>(my_list[i]));
330  aisc_link(&pdc->ptprobes, my_list[i]);
331  }
332 
333  free(my_list);
334  }
335  }
336 }
337 static void clip_tprobes(PT_pdc *pdc, int count) {
338  PT_tprobes *tprobe;
339  int i;
340 
341  for (i=0, tprobe = pdc->tprobes;
342  tprobe && i< count;
343  i++, tprobe = tprobe->next) ;
344 
345  if (tprobe) {
346  while (tprobe->next) {
347  destroy_PT_tprobes(tprobe->next);
348  }
349  }
350 }
351 static int pt_get_gc_content(char *probe) {
352  int gc = 0;
353  while (*probe) {
354  if (*probe == PT_G ||
355  *probe == PT_C) {
356  gc++;
357  }
358  probe ++;
359  }
360  return gc;
361 }
362 static double pt_get_temperature(const char *probe) {
363  int i;
364  double t = 0;
365  while ((i=*(probe++))) {
366  if (i == PT_G || i == PT_C) t+=4.0; else t += 2.0;
367  }
368  return t;
369 }
370 
371 static void tprobes_sumup_perc_and_calc_quality(PT_pdc *pdc) {
372  for (PT_tprobes *tprobe = pdc->tprobes; tprobe; tprobe = tprobe->next) {
373  // here tprobe->perc contains the outgroup matches,
374  // summarized for each single temperature decrease needed to assume a probe-hit
375  // (see summarize_centigrade_hits() and calc_centigrade_pos())
376 
377  // now summarize over list:
378  int sum = 0;
379  int i;
380  for (i=0; i<PERC_SIZE; ++i) {
381  sum += tprobe->perc[i];
382  tprobe->perc[i] = sum;
383  }
384 
385  // pt_assert(tprobe->perc[0] == tprobe->mishits); // OutgroupMatcher and count_mishits_for_matched do not agree!
386  // @@@ found one case where it differs by 1 (6176 in perc, 6177 in mishits). Can this be caused somehow by N or IUPAC in seq?
387 
388  int limit = 2*tprobe->mishits;
389  for (i=0; i<(PERC_SIZE-1); ++i) {
390  if (tprobe->perc[i]>limit) break;
391  }
392 
393  // quality calculation; documented in ../HELP_SOURCE/source/probedesignresult.hlp@decrease
394  tprobe->quality = ((double)tprobe->groupsize * i) + 1000.0/(1000.0 + tprobe->perc[i]);
395 
396 #if defined(DEBUG) && 0
397  fprintf(stderr, "quality=%3i (groupsize=%3i, mishits=%3i, limit=%3i, COL=%2i, hits[COL]=%i)\n",
398  int(tprobe->quality+0.5),
399  tprobe->groupsize,
400  tprobe->mishits,
401  limit,
402  i,
403  tprobe->perc[i]);
404 #endif
405  }
406 }
407 
408 static char hitgroup_idx2char(int idx) {
409  const int firstSmall = ALPHA_SIZE/2;
410  int c;
411  if (idx >= firstSmall) {
412  c = 'a'+(idx-firstSmall);
413  }
414  else {
415  c = 'A'+idx;
416  }
417  pt_assert(isalpha(c));
418  return c;
419 }
420 
421 inline int get_max_probelen(const PT_pdc *pdc) {
422  return pdc->max_probelen == -1 ? pdc->min_probelen : pdc->max_probelen;
423 }
424 
425 inline int shown_apos(const PT_tprobes *tprobe) { return info2bio(tprobe->apos); }
426 inline int shown_ecoli(const PT_tprobes *tprobe) { return PT_abs_2_ecoli_rel(tprobe->apos+1); }
427 inline int shown_qual(const PT_tprobes *tprobe) { return int(tprobe->quality+0.5); }
428 
430  int width[PERC_SIZE]; // of centigrade list columns
431  int maxprobelen;
432 
433  int apos_width;
434  int ecol_width;
435  int qual_width;
436  int grps_width;
437 
438  static inline void track_max(int& tracked, int val) { tracked = std::max(tracked, val); }
439 
440  void collect(const int *perc) { for (int i = 0; i<PERC_SIZE; ++i) width[i] = std::max(width[i], perc[i]); }
441  void collect(const PT_tprobes *tprobe) {
442  collect(tprobe->perc);
443  track_max(apos_width, shown_apos(tprobe));
444  track_max(ecol_width, shown_ecoli(tprobe));
445  track_max(qual_width, shown_qual(tprobe));
446  track_max(grps_width, shown_qual(tprobe));
447  track_max(maxprobelen, tprobe->seq_len);
448  }
449 
450  void clear() {
451  for (int i = 0; i<PERC_SIZE; ++i) width[i] = 0;
452 
453  apos_width = 0;
454  ecol_width = 0;
455  qual_width = 0;
456  grps_width = 0;
457 
458  maxprobelen = 0;
459  }
460 
461  static inline int width4num(int num) {
462  pt_assert(num >= 0);
463 
464  int digits = 0;
465  while (num) {
466  ++digits;
467  num /= 10;
468  }
469  return digits ? digits : 1;
470  }
471 
472 public:
473  PD_formatter() { clear(); }
474  PD_formatter(const PT_pdc *pdc) {
475  clear();
476 
477  // collect max values for output columns:
478  for (PT_tprobes *tprobe = pdc->tprobes; tprobe; tprobe = tprobe->next) collect(tprobe);
479 
480  // convert max.values to needed print-width:
481  for (int i = 0; i<PERC_SIZE; ++i) width[i] = width4num(width[i]);
482 
483  apos_width = std::max(width4num(apos_width), 2);
484  ecol_width = std::max(width4num(ecol_width), 4);
485  qual_width = std::max(width4num(qual_width), 4);
486  grps_width = std::max(width4num(grps_width), 4);
487  }
488 
489  int sprint_centigrade_list(char *buffer, const int *perc) const {
490  // format centigrade-decrement-list
491  int printed = 0;
492  for (int i = 0; i<PERC_SIZE; ++i) {
493  buffer[printed++] = ' ';
494  printed += perc[i]
495  ? sprintf(buffer+printed, "%*i", width[i], perc[i])
496  : sprintf(buffer+printed, "%*s", width[i], "-");
497  }
498  return printed;
499  }
500 
501  int get_max_designed_len() const { return maxprobelen; }
502 
503  int get_apos_width() const { return apos_width; }
504  int get_ecol_width() const { return ecol_width; }
505  int get_qual_width() const { return qual_width; }
506  int get_grps_width() const { return grps_width; }
507 };
508 
509 typedef std::map<const PT_pdc*const, PD_formatter> PD_Formatter4design;
510 static PD_Formatter4design format4design;
511 
512 inline const PD_formatter& get_formatter(const PT_pdc *pdc) {
513  PD_Formatter4design::iterator found = format4design.find(pdc);
514  if (found == format4design.end()) {
515  format4design[pdc] = PD_formatter(pdc);
516  found = format4design.find(pdc);
517  }
518  pt_assert(found != format4design.end());
519 
520  return found->second;
521 }
522 inline void erase_formatter(const PT_pdc *pdc) { format4design.erase(pdc); }
523 
524 char *get_design_info(const PT_tprobes *tprobe) {
525  pt_assert(tprobe);
526 
527  const int BUFFERSIZE = 2000;
528  char *buffer = (char *)GB_give_buffer(BUFFERSIZE);
529  PT_pdc *pdc = (PT_pdc *)tprobe->mh.parent->parent;
530  char *p = buffer;
531  const PD_formatter& formatter = get_formatter(pdc);
532 
533  // target
534  {
535  int len = tprobe->seq_len;
536  char *probe = (char *)GB_give_buffer2(len+1);
537  strcpy(probe, tprobe->sequence);
538 
539  probe_2_readable(probe, len); // convert probe to real ASCII
540  p += sprintf(p, "%-*s", formatter.get_max_designed_len()+1, probe);
541  }
542 
543  {
544  int apos = shown_apos(tprobe);
545  int c;
546  int cs = 0;
547 
548  // search nearest previous hit
549  for (c=0; c<ALPHA_SIZE; c++) {
550  if (!pdc->pos_groups[c]) { // new group
551  pdc->pos_groups[c] = apos;
552  cs = '=';
553  break;
554  }
555  int dist = abs(apos - pdc->pos_groups[c]);
556  if (dist < pdc->min_probelen) {
557  apos = apos - pdc->pos_groups[c];
558  if (apos >= 0) {
559  cs = '+';
560  break;
561  }
562  cs = '-';
563  apos = -apos;
564  break;
565  }
566  }
567 
568  if (cs) {
569  c = hitgroup_idx2char(c);
570  }
571  else {
572  // ran out of pos_groups
573  c = '?';
574  cs = ' ';
575  }
576 
577  // le apos ecol
578  p += sprintf(p, "%2i ", tprobe->seq_len);
579  p += sprintf(p, "%c%c%*i ", c, cs, formatter.get_apos_width(), apos);
580  p += sprintf(p, "%*i ", formatter.get_ecol_width(), shown_ecoli(tprobe)); // ecoli-bases inclusive apos ("bases before apos+1")
581  }
582 
583  p += sprintf(p, "%*i ", formatter.get_qual_width(), shown_qual(tprobe));
584  p += sprintf(p, "%*i ", formatter.get_grps_width(), tprobe->groupsize);
585  p += sprintf(p, "%5.1f", ((double)pt_get_gc_content(tprobe->sequence))/tprobe->seq_len*100.0); // G+C
586  p += sprintf(p, "%5.1f ", pt_get_temperature(tprobe->sequence)); // temperature
587 
588  // probe string
589  {
590  char *probe = create_reversed_probe(tprobe->sequence, tprobe->seq_len);
591  complement_probe(probe, tprobe->seq_len);
592  probe_2_readable(probe, tprobe->seq_len); // convert probe to real ASCII
593  p += sprintf(p, "%*s |", formatter.get_max_designed_len(), probe);
594  free(probe);
595  }
596 
597  // non-group hits by temp. decrease
598  p += formatter.sprint_centigrade_list(p, tprobe->perc);
599  if (!tprobe->next) erase_formatter(pdc); // erase formatter when done with last probe
600 
601  pt_assert((p-buffer)<BUFFERSIZE);
602  return buffer;
603 }
604 
605 char *get_design_hinfo(const PT_pdc *pdc) {
606  const int BUFFERSIZE = 2000;
607  char *buffer = (char *)GB_give_buffer(BUFFERSIZE);
608  char *s = buffer;
609 
610  const PD_formatter& formatter = get_formatter(pdc);
611 
612  {
613  char *ecolipos = NULp;
614  if (pdc->min_ecolipos == -1) {
615  if (pdc->max_ecolipos == -1) {
616  ecolipos = ARB_strdup("any");
617  }
618  else {
619  ecolipos = GBS_global_string_copy("<= %i", pdc->max_ecolipos);
620  }
621  }
622  else {
623  if (pdc->max_ecolipos == -1) {
624  ecolipos = GBS_global_string_copy(">= %i", pdc->min_ecolipos);
625  }
626  else {
627  ecolipos = GBS_global_string_copy("%4i -%5i", pdc->min_ecolipos, pdc->max_ecolipos);
628  }
629  }
630 
631  char *mishit_annotation = NULp;
632  if (pdc->min_rj_mishits<INT_MAX) {
633  mishit_annotation = GBS_global_string_copy(" (lowest rejected nongroup hits: %i)", pdc->min_rj_mishits);
634  }
635 
636  char *coverage_annotation = NULp;
637  if (pdc->max_rj_coverage>0.0) {
638  coverage_annotation = GBS_global_string_copy(" (max. rejected coverage: %.0f%%)", pdc->max_rj_coverage*100.0);
639  }
640 
641  char *probelen_info;
642  if (get_max_probelen(pdc) == pdc->min_probelen) {
643  probelen_info = GBS_global_string_copy("%i", pdc->min_probelen);
644  }
645  else {
646  probelen_info = GBS_global_string_copy("%i-%i", pdc->min_probelen, get_max_probelen(pdc));
647  }
648 
649  s += sprintf(s,
650  "Probe design parameters:\n"
651  "Length of probe %s\n"
652  "Temperature [%4.1f -%5.1f]\n"
653  "GC-content [%4.1f -%5.1f]\n"
654  "E.Coli position [%s]\n"
655  "Max. nongroup hits %i%s\n"
656  "Min. group hits %.0f%%%s\n",
657  probelen_info,
658  pdc->mintemp, pdc->maxtemp,
659  pdc->min_gc*100.0, pdc->max_gc*100.0,
660  ecolipos,
661  pdc->max_mishits, null2empty(mishit_annotation),
662  pdc->min_coverage*100.0, null2empty(coverage_annotation));
663 
664  free(probelen_info);
665  free(coverage_annotation);
666  free(mishit_annotation);
667 
668  free(ecolipos);
669  }
670 
671  if (pdc->tprobes) {
672  int maxprobelen = formatter.get_max_designed_len();
673 
674  s += sprintf(s, "%-*s", maxprobelen+1, "Target");
675  s += sprintf(s, "le ");
676  s += sprintf(s, "%*s ", formatter.get_apos_width()+2, "apos");
677  s += sprintf(s, "%*s ", formatter.get_ecol_width(), "ecol");
678  s += sprintf(s, "%*s ", formatter.get_qual_width(), "qual");
679  s += sprintf(s, "%*s ", formatter.get_grps_width(), "grps");
680  s += sprintf(s, " G+C temp ");
681  s += sprintf(s, "%*s | ", maxprobelen, maxprobelen<14 ? "Probe" : "Probe sequence");
682  s += sprintf(s, "Decrease T by n*.3C -> probe matches n non group species");
683  }
684  else {
685  s += sprintf(s, "No probes found!");
686  erase_formatter(pdc);
687  }
688 
689  pt_assert((s-buffer)<BUFFERSIZE);
690 
691  return buffer;
692 }
693 
695  int mishits;
696  const char *probe;
697  int height;
698 
699  ptnd_chain_count_mishits() : mishits(0), probe(NULp), height(0) {}
700  ptnd_chain_count_mishits(const char *probe_, int height_) : mishits(0), probe(probe_), height(height_) {}
701 
702  int operator()(const AbsLoc& probeLoc) {
703  // count all mishits for a probe
704 
705  psg.abs_pos.announce(probeLoc.get_abs_pos());
706 
707  if (probeLoc.get_pid().outside_group()) {
708  if (probe) {
709  pt_assert(probe[0]); // if this case occurs, avoid entering this branch
710 
711  DataLoc dataLoc(probeLoc);
712  ReadableDataLoc readableLoc(dataLoc);
713  for (int i = 0; probe[i] && readableLoc[height+i]; ++i) {
714  if (probe[i] != readableLoc[height+i]) return 0;
715  }
716  }
717  mishits++;
718  }
719  return 0;
720  }
721 };
722 
725  if (!pt)
726  return 0;
727 
728  if (pt->is_leaf()) {
729  DataLoc loc(pt);
731  return loc.get_pid().outside_group();
732  }
733 
734  if (pt->is_chain()) {
736  PT_forwhole_chain(pt, counter); // @@@ expand
737  return counter.mishits;
738  }
739 
740  int mishits = 0;
741  for (int base = PT_QU; base< PT_BASES; base++) {
742  mishits += count_mishits_for_all(PT_read_son(pt, (PT_base)base));
743  }
744  return mishits;
745 }
746 
747 static int count_mishits_for_matched(char *probe, POS_TREE2 *pt, int height) {
749  if (!pt) return 0;
750  if (pt->is_node() && *probe) {
751  int mishits = 0;
752  for (int i=PT_A; i<PT_BASES; i++) {
753  if (i != *probe) continue;
754  POS_TREE2 *pthelp = PT_read_son(pt, (PT_base)i);
755  if (pthelp) mishits += count_mishits_for_matched(probe+1, pthelp, height+1);
756  }
757  return mishits;
758  }
759  if (*probe) {
760  if (pt->is_leaf()) {
761  const ReadableDataLoc loc(pt);
762 
763  int pos = loc.get_rel_pos()+height;
764 
765  if (pos + (int)(strlen(probe)) >= loc.get_pid().get_size()) // after end // @@@ wrong check ? better return from loop below when ref is PT_QU
766  return 0;
767 
768  for (int i = 0; probe[i] && loc[height+i]; ++i) {
769  if (probe[i] != loc[height+i]) {
770  return 0;
771  }
772  }
773  }
774  else { // chain
775  ptnd_chain_count_mishits counter(probe, height);
776  PT_forwhole_chain(pt, counter); // @@@ expand
777  return counter.mishits;
778  }
779  }
780  return count_mishits_for_all(pt);
781 }
782 
783 static void remove_tprobes_with_too_many_mishits(PT_pdc *pdc) {
785  // tracks minimum rejected mishits in 'pdc->min_rj_mishits'
786  int min_rej_mishit_amount = INT_MAX;
787 
788  for (PT_tprobes *tprobe = pdc->tprobes; tprobe; ) {
789  PT_tprobes *tprobe_next = tprobe->next;
790 
791  psg.abs_pos.clear();
792  tprobe->mishits = count_mishits_for_matched(tprobe->sequence, psg.TREE_ROOT2(), 0);
793  tprobe->apos = psg.abs_pos.get_most_used();
794  if (tprobe->mishits > pdc->max_mishits) {
795  min_rej_mishit_amount = std::min(min_rej_mishit_amount, tprobe->mishits);
796  destroy_PT_tprobes(tprobe);
797  }
798  tprobe = tprobe_next;
799  }
800  psg.abs_pos.clear();
801 
802  pdc->min_rj_mishits = std::min(pdc->min_rj_mishits, min_rej_mishit_amount);
803 }
804 
805 static void remove_tprobes_outside_ecoli_range(PT_pdc *pdc) {
807  // if (pdc->min_ecolipos == pdc->max_ecolipos) return; // @@@ wtf was this for?
808 
809  for (PT_tprobes *tprobe = pdc->tprobes; tprobe; ) {
810  PT_tprobes *tprobe_next = tprobe->next;
811  long relpos = PT_abs_2_ecoli_rel(tprobe->apos+1);
812  if (relpos < pdc->min_ecolipos || (relpos > pdc->max_ecolipos && pdc->max_ecolipos != -1)) {
813  destroy_PT_tprobes(tprobe);
814  }
815  tprobe = tprobe_next;
816  }
817 }
818 
819 static size_t tprobes_calculate_bonds(PT_local *locs) {
826  PT_pdc *pdc = locs->pdc;
827  size_t count = 0;
828 
829  MaxBond mbond(locs->bond);
830 
831  for (PT_tprobes *tprobe = pdc->tprobes; tprobe; ) {
832  PT_tprobes *tprobe_next = tprobe->next;
833  tprobe->seq_len = strlen(tprobe->sequence);
834  double sbond = 0.0;
835  for (int i=0; i<tprobe->seq_len; i++) {
836  sbond += mbond.get_max_bond(tprobe->sequence[i]);
837  }
838  tprobe->sum_bonds = sbond;
839 
840  ++count;
841  tprobe = tprobe_next;
842  }
843  return count;
844 }
845 
846 class CentigradePos { // track minimum centigrade position for each species
847  typedef std::map<int, int> Pos4Name;
848 
849  Pos4Name cpos; // key = outgroup-species-id; value = min. centigrade position for (partial) probe
850 
851 public:
852 
853  static bool is_valid_pos(int pos) { return pos >= 0 && pos<PERC_SIZE; }
854 
855  void set_pos_for(int name, int pos) {
856  pt_assert(is_valid_pos(pos)); // otherwise it should not be put into CentigradePos
857  Pos4Name::iterator found = cpos.find(name);
858  if (found == cpos.end()) {
859  cpos[name] = pos;
860  }
861  else if (pos<found->second) {
862  found->second = pos;
863  }
864  }
865 
866  void summarize_centigrade_hits(int *centigrade_hits) const {
867  for (int i = 0; i<PERC_SIZE; ++i) centigrade_hits[i] = 0;
868  for (Pos4Name::const_iterator p = cpos.begin(); p != cpos.end(); ++p) {
869  int pos = p->second;
870  pt_assert(is_valid_pos(pos)); // otherwise it should not be put into CentigradePos
871  centigrade_hits[pos]++;
872  }
873  }
874 
875  bool empty() const { return cpos.empty(); }
876 };
877 
878 class OutgroupMatcher : virtual Noncopyable {
879  const PT_pdc *const pdc;
880 
881  Splits splits;
882 
883  PT_tprobes *currTprobe;
884  CentigradePos result;
885 
886  bool only_bind_behind_dot; // true for suffix-matching
887 
888  void uncond_announce_match(int centPos, const AbsLoc& loc) {
890 #if defined(DUMP_OLIGO_MATCHING)
891  fprintf(stderr, "announce_match centPos=%2i loc", centPos);
892  loc.dump(stderr);
893  fflush_all();
894 #endif
895  result.set_pos_for(loc.get_name(), centPos);
896  }
897 
898  bool location_follows_dot(const ReadableDataLoc& loc) const { return loc[-1] == PT_QU; }
899  bool location_follows_dot(const DataLoc& loc) const { return location_follows_dot(ReadableDataLoc(loc)); } // very expensive @@@ optimize using relpos
900  bool location_follows_dot(const AbsLoc& loc) const { return location_follows_dot(ReadableDataLoc(DataLoc(loc))); } // very very expensive
901 
902  template<typename LOC>
903  bool acceptable_location(const LOC& loc) const {
904  return loc.get_pid().outside_group() &&
905  (only_bind_behind_dot ? location_follows_dot(loc) : true);
906  }
907 
908  template<typename LOC>
909  void announce_match_if_acceptable(int centPos, const LOC& loc) {
910  if (acceptable_location(loc)) {
911  uncond_announce_match(centPos, loc);
912  }
913  }
914  void announce_match_if_acceptable(int centPos, POS_TREE2 *pt) {
915  switch (pt->get_type()) {
916  case PT2_NODE:
917  for (int i = PT_QU; i<PT_BASES; ++i) {
918  POS_TREE2 *ptson = PT_read_son(pt, (PT_base)i);
919  if (ptson) {
920  announce_match_if_acceptable(centPos, ptson);
921  }
922  }
923  break;
924 
925  case PT2_LEAF:
926  announce_match_if_acceptable(centPos, DataLoc(pt));
927  break;
928 
929  case PT2_CHAIN: {
930  ChainIteratorStage2 iter(pt);
931  while (iter) {
932  announce_match_if_acceptable(centPos, iter.at());
933  ++iter;
934  }
935  break;
936  }
937  }
938  }
939 
940  template <typename PT_OR_LOC>
941  void announce_possible_match(const MatchingOligo& oligo, PT_OR_LOC pt_or_loc) {
942  pt_assert(oligo.domainSeen());
943  pt_assert(!oligo.dangling());
944 
945  int centPos = oligo.calc_centigrade_pos(currTprobe, pdc);
946  if (centPos<PERC_SIZE) {
947  announce_match_if_acceptable(centPos, pt_or_loc);
948  }
949  }
950 
951  bool might_reach_centigrade_pos(const MatchingOligo& oligo) const {
952  return !oligo.centigrade_pos_out_of_reach(currTprobe, pdc, splits);
953  }
954 
955  void bind_rest(const MatchingOligo& oligo, const ReadableDataLoc& loc, const int height) {
956  // unconditionally bind rest of oligo versus sequence
957 
958  pt_assert(oligo.domainSeen()); // entry-invariant for bind_rest()
959  pt_assert(oligo.dangling()); // otherwise ReadableDataLoc is constructed w/o need (very expensive!)
960  pt_assert(might_reach_centigrade_pos(oligo)); // otherwise ReadableDataLoc is constructed w/o need (very expensive!)
961  pt_assert(loc.get_pid().outside_group()); // otherwise we are not interested in the result
962 
963  if (loc[height]) {
964  MatchingOligo more = oligo.bind_against(loc[height], splits);
965  pt_assert(more.domainSeen()); // implied by oligo.domainSeen()
966  if (more.dangling()) {
967  if (might_reach_centigrade_pos(more)) {
968  bind_rest(more, loc, height+1);
969  }
970  }
971  else {
972  announce_possible_match(more, loc);
973  }
974  }
975  else {
976  MatchingOligo all = oligo.dont_bind_rest();
977  announce_possible_match(all, loc);
978  }
979  }
980  void bind_rest_if_outside_group(const MatchingOligo& oligo, const DataLoc& loc, const int height) {
981  if (loc.get_pid().outside_group() && might_reach_centigrade_pos(oligo)) {
982  bind_rest(oligo, ReadableDataLoc(loc), height);
983  }
984  }
985  void bind_rest_if_outside_group(const MatchingOligo& oligo, const AbsLoc& loc, const int height) {
986  if (loc.get_pid().outside_group() && might_reach_centigrade_pos(oligo)) {
987  bind_rest(oligo, ReadableDataLoc(DataLoc(loc)), height);
988  }
989  }
990 
991  void bind_rest(const MatchingOligo& oligo, POS_TREE2 *pt, const int height) {
992  // unconditionally bind rest of oligo versus complete index-subtree
993  pt_assert(oligo.domainSeen()); // entry-invariant for bind_rest()
994 
995  if (oligo.dangling()) {
996  if (might_reach_centigrade_pos(oligo)) {
997  switch (pt->get_type()) {
998  case PT2_NODE: {
999  POS_TREE2 *ptdotson = PT_read_son(pt, PT_QU);
1000  if (ptdotson) {
1001  MatchingOligo all = oligo.dont_bind_rest();
1002  announce_possible_match(all, ptdotson);
1003  }
1004 
1005  for (int i = PT_N; i<PT_BASES; ++i) {
1006  POS_TREE2 *ptson = PT_read_son(pt, (PT_base)i);
1007  if (ptson) {
1008  bind_rest(oligo.bind_against(i, splits), ptson, height+1);
1009  }
1010  }
1011  break;
1012  }
1013  case PT2_LEAF:
1014  bind_rest_if_outside_group(oligo, DataLoc(pt), height);
1015  break;
1016 
1017  case PT2_CHAIN:
1018  ChainIteratorStage2 iter(pt);
1019  while (iter) {
1020  bind_rest_if_outside_group(oligo, iter.at(), height);
1021  ++iter;
1022  }
1023  break;
1024  }
1025  }
1026  }
1027  else {
1028  announce_possible_match(oligo, pt);
1029  }
1030  }
1031 
1032  void bind_till_domain(const MatchingOligo& oligo, const ReadableDataLoc& loc, const int height) {
1033  pt_assert(!oligo.domainSeen() && oligo.domainPossible()); // entry-invariant for bind_till_domain()
1034  pt_assert(oligo.dangling()); // otherwise ReadableDataLoc is constructed w/o need (very expensive!)
1035  pt_assert(might_reach_centigrade_pos(oligo)); // otherwise ReadableDataLoc is constructed w/o need (very expensive!)
1036  pt_assert(loc.get_pid().outside_group()); // otherwise we are not interested in the result
1037 
1038  if (is_std_base_or_N(loc[height])) { // do not try to bind domain versus dot
1039  MatchingOligo more = oligo.bind_against(loc[height], splits);
1040  if (more.dangling()) {
1041  if (might_reach_centigrade_pos(more)) {
1042  if (more.domainSeen()) bind_rest (more, loc, height+1);
1043  else if (more.domainPossible()) bind_till_domain(more, loc, height+1);
1044  }
1045  }
1046  else if (more.domainSeen()) {
1047  announce_possible_match(more, loc);
1048  }
1049  }
1050  }
1051  void bind_till_domain_if_outside_group(const MatchingOligo& oligo, const DataLoc& loc, const int height) {
1052  if (loc.get_pid().outside_group() && might_reach_centigrade_pos(oligo)) {
1053  bind_till_domain(oligo, ReadableDataLoc(loc), height);
1054  }
1055  }
1056  void bind_till_domain_if_outside_group(const MatchingOligo& oligo, const AbsLoc& loc, const int height) {
1057  if (loc.get_pid().outside_group() && might_reach_centigrade_pos(oligo)) {
1058  bind_till_domain(oligo, ReadableDataLoc(DataLoc(loc)), height);
1059  }
1060  }
1061 
1062  void bind_till_domain(const MatchingOligo& oligo, POS_TREE2 *pt, const int height) {
1063  pt_assert(!oligo.domainSeen() && oligo.domainPossible()); // entry-invariant for bind_till_domain()
1064  pt_assert(oligo.dangling());
1065 
1066  if (might_reach_centigrade_pos(oligo)) {
1067  switch (pt->get_type()) {
1068  case PT2_NODE:
1069  for (int i = PT_A; i<PT_BASES; ++i) {
1070  POS_TREE2 *ptson = PT_read_son(pt, (PT_base)i);
1071  if (ptson) {
1072  MatchingOligo sonOligo = oligo.bind_against(i, splits);
1073 
1074  if (sonOligo.domainSeen()) bind_rest (sonOligo, ptson, height+1);
1075  else if (sonOligo.domainPossible()) bind_till_domain(sonOligo, ptson, height+1);
1076  }
1077  }
1078  break;
1079 
1080  case PT2_LEAF:
1081  bind_till_domain_if_outside_group(oligo, DataLoc(pt), height);
1082  break;
1083 
1084  case PT2_CHAIN:
1085  ChainIteratorStage2 iter(pt);
1086  while (iter) {
1087  bind_till_domain_if_outside_group(oligo, iter.at(), height);
1088  ++iter;
1089  }
1090  break;
1091  }
1092  }
1093  }
1094 
1095  void reset() { result = CentigradePos(); }
1096 
1097 public:
1098  OutgroupMatcher(PT_local *locs_, PT_pdc *pdc_)
1099  : pdc(pdc_),
1100  splits(locs_),
1101  currTprobe(NULp),
1102  only_bind_behind_dot(false)
1103  {}
1104 
1105  void calculate_outgroup_matches(PT_tprobes& tprobe) {
1106  LocallyModify<PT_tprobes*> assign_tprobe(currTprobe, &tprobe);
1107  pt_assert(result.empty());
1108 
1109  MatchingOligo fullProbe(Oligo(tprobe.sequence, tprobe.seq_len));
1110  POS_TREE2 *ptroot = psg.TREE_ROOT2();
1111 
1112  pt_assert(!fullProbe.domainSeen());
1113  pt_assert(fullProbe.domainPossible()); // probe length too short (probes shorter than DOMAIN_MIN_LENGTH always report 0 outgroup hits -> wrong result)
1114 
1115  arb_progress progress(long(tprobe.seq_len));
1116 
1117  only_bind_behind_dot = false;
1118  bind_till_domain(fullProbe, ptroot, 0);
1119  ++progress;
1120 
1121  // match all suffixes of probe
1122  // - detect possible partial matches at start of alignment (and behind dots inside the sequence)
1123  only_bind_behind_dot = true;
1124  for (int off = 1; off<tprobe.seq_len; ++off) {
1125  MatchingOligo probeSuffix(fullProbe.get_oligo().suffix(off));
1126  if (!probeSuffix.domainPossible()) break; // abort - no smaller suffix may create a domain
1127  bind_till_domain(probeSuffix, ptroot, 0);
1128  ++progress;
1129  }
1130 
1131  result.summarize_centigrade_hits(tprobe.perc);
1132  progress.done();
1133 
1134  reset();
1135  }
1136 };
1137 
1139  const char *seq;
1140 
1141 public:
1142  ProbeOccurrence(const char *seq_) : seq(seq_) {}
1143 
1144  const char *sequence() const { return seq; }
1145 
1146  void forward() { ++seq; }
1147 
1148  bool less(const ProbeOccurrence& other, size_t len) const {
1149  const char *mine = sequence();
1150  const char *his = other.sequence();
1151 
1152  int cmp = 0;
1153  for (size_t i = 0; i<len && !cmp; ++i) {
1154  cmp = mine[i]-his[i];
1155  }
1156  return cmp<0;
1157  }
1158 };
1159 
1161  ProbeOccurrence cursor;
1162 
1163  size_t probelen;
1164  size_t rest; // size of seqpart behind cursor
1165  size_t gc, at; // count G+C and A+T
1166 
1167  bool has_only_std_bases() const { return (gc+at) == probelen; }
1168 
1169  bool at_end() const { return !rest; }
1170 
1171  PT_base base_at(size_t offset) const {
1172  pt_assert(offset < probelen);
1173  return PT_base(cursor.sequence()[offset]);
1174  }
1175 
1176  void forget(PT_base b) {
1177  switch (b) {
1178  case PT_A: case PT_T: pt_assert(at>0); --at; break;
1179  case PT_C: case PT_G: pt_assert(gc>0); --gc; break;
1180  default : break;
1181  }
1182  }
1183  void count(PT_base b) {
1184  switch (b) {
1185  case PT_A: case PT_T: ++at; break;
1186  case PT_C: case PT_G: ++gc; break;
1187  default : break;
1188  }
1189  }
1190 
1191  void init_counts() {
1192  at = gc = 0;
1193  for (size_t i = 0; i<probelen; ++i) {
1194  count(base_at(i));
1195  }
1196  }
1197 
1198 public:
1199  ProbeIterator(const char *seq, size_t seqlen, size_t probelen_)
1200  : cursor(seq),
1201  probelen(probelen_),
1202  rest(seqlen-probelen)
1203  {
1204  pt_assert(seqlen >= probelen);
1205  init_counts();
1206  }
1207 
1208  bool next() {
1209  if (at_end()) return false;
1210  forget(base_at(0));
1211  cursor.forward();
1212  --rest;
1213  count(base_at(probelen-1));
1214  return true;
1215  }
1216 
1217  int get_temperature() const { return 2*at + 4*gc; }
1218 
1219  bool gc_in_wanted_range(PT_pdc *pdc) const {
1220  return pdc->min_gc*probelen <= gc && gc <= pdc->max_gc*probelen;
1221  }
1222  bool temperature_in_wanted_range(PT_pdc *pdc) const {
1223  int temp = get_temperature();
1224  return pdc->mintemp <= temp && temp <= pdc->maxtemp;
1225  }
1226 
1227  bool feasible(PT_pdc *pdc) const {
1228  return has_only_std_bases() &&
1229  gc_in_wanted_range(pdc) &&
1231  }
1232 
1233  const ProbeOccurrence& occurrence() const { return cursor; }
1234 
1235  void dump(FILE *out) const {
1236  char *probe = readable_probe(cursor.sequence(), probelen, 'T');
1237  fprintf(out, "probe='%s' probelen=%zu at=%zu gc=%zu\n", probe, probelen, at, gc);
1238  free(probe);
1239  }
1240 };
1241 
1242 class PO_Less {
1243  size_t probelen;
1244 public:
1245  PO_Less(size_t probelen_) : probelen(probelen_) {}
1246  bool operator()(const ProbeOccurrence& a, const ProbeOccurrence& b) const { return a.less(b, probelen); }
1247 };
1248 
1249 struct SCP_Less {
1250  bool operator()(const SmartCharPtr& p1, const SmartCharPtr& p2) { return &*p1 < &*p2; } // simply compare addresses
1251 };
1252 
1254  typedef std::set<ProbeOccurrence, PO_Less> Candidates;
1255  typedef std::map<ProbeOccurrence, int, PO_Less> CandidateHits;
1256  typedef std::set<SmartCharPtr, SCP_Less> SeqData;
1257 
1258  size_t probelen;
1259  CandidateHits candidateHits;
1260  SeqData data; // locks SmartPtrs to input data (ProbeOccurrences point inside their data)
1261 
1262 public:
1263  ProbeCandidates(size_t probelen_)
1264  : probelen(probelen_),
1265  candidateHits(probelen)
1266  {}
1267 
1268  void generate_for_sequence(PT_pdc *pdc, const SmartCharPtr& seq, size_t bp) {
1269  arb_progress base_progress(2*bp);
1270  if (bp >= probelen) {
1271  // collect all probe candidates for current sequence
1272  Candidates candidates(probelen);
1273  {
1274  data.insert(seq);
1275  ProbeIterator probe(&*seq, bp, probelen);
1276  do {
1277  if (probe.feasible(pdc)) {
1278  candidates.insert(probe.occurrence());
1279  ++base_progress;
1280  }
1281  } while (probe.next());
1282  }
1283 
1284  // increment overall hitcount for each found candidate
1285  for (Candidates::iterator c = candidates.begin(); c != candidates.end(); ++c) {
1286  candidateHits[*c]++;
1287  ++base_progress;
1288  }
1289  }
1290  base_progress.done();
1291  }
1292 
1293  void create_tprobes(PT_pdc *pdc, int ingroup_size) {
1294  // tracks maximum rejected target-group coverage
1295  int min_ingroup_hits = (ingroup_size * pdc->min_coverage + .5);
1296  int max_rej_ingroup_hits = 0;
1297 
1298  for (CandidateHits::iterator c = candidateHits.begin(); c != candidateHits.end(); ++c) {
1299  int ingroup_hits = c->second;
1300  if (ingroup_hits >= min_ingroup_hits) {
1301  const ProbeOccurrence& candi = c->first;
1302 
1303  PT_tprobes *tprobe = create_PT_tprobes();
1304 
1305  tprobe->sequence = ARB_strndup(candi.sequence(), probelen);
1306  tprobe->temp = pt_get_temperature(tprobe->sequence);
1307  tprobe->groupsize = ingroup_hits;
1308 
1309  aisc_link(&pdc->ptprobes, tprobe);
1310  }
1311  else {
1312  max_rej_ingroup_hits = std::max(max_rej_ingroup_hits, ingroup_hits);
1313  }
1314  }
1315 
1316  double max_rejected_coverage = max_rej_ingroup_hits/double(ingroup_size);
1317  pdc->max_rj_coverage = std::max(pdc->max_rj_coverage, max_rejected_coverage);
1318  }
1319 };
1320 
1321 class DesignTargets : virtual Noncopyable {
1322  PT_pdc *pdc;
1323 
1324  int targets; // overall target sequence count
1325  int known_targets; // known target sequences
1326  int added_targets; // added (=unknown) target sequences
1327 
1328  long datasize; // overall bp of target sequences
1329 
1330  int min_probe_length; // min. designed probe length
1331 
1332  int *known2id;
1333 
1334  ARB_ERROR error;
1335 
1336  void announce_seq_bp(long bp) {
1337  pt_assert(!error);
1338  if (bp<long(min_probe_length)) {
1339  error = GBS_global_string("Sequence contains only %lu bp. Impossible design request for", bp);
1340  }
1341  else {
1342  datasize += bp;
1343  if (datasize<bp) datasize = ULONG_MAX;
1344  }
1345  }
1346 
1347  void scan_added_targets() {
1348  added_targets = 0;
1349  for (PT_sequence *seq = pdc->sequences; seq && !error; seq = seq->next) {
1350  announce_seq_bp(seq->seq.size);
1351  ++added_targets;
1352  }
1353  if (error) error = GBS_global_string("%s one of the added sequences", error.deliver());
1354  }
1355  void scan_known_targets(int expected_known_targets) {
1356  known2id = new int[expected_known_targets];
1357  known_targets = 0;
1358 
1359  for (int id = 0; id < psg.data_count && !error; ++id) {
1360  const probe_input_data& pid = psg.data[id];
1361  if (pid.inside_group()) {
1362  announce_seq_bp(pid.get_size());
1363  known2id[known_targets++] = id;
1364  pt_assert(known_targets <= expected_known_targets);
1365  if (error) error = GBS_global_string("%s species '%s'", error.deliver(), pid.get_shortname());
1366  }
1367  }
1368  }
1369 
1370 public:
1371  DesignTargets(PT_pdc *pdc_, int expected_known_targets)
1372  : pdc(pdc_),
1373  targets(0),
1374  known_targets(0),
1375  datasize(0),
1376  min_probe_length(pdc->min_probelen),
1377  known2id(NULp)
1378  {
1379  scan_added_targets(); // calc added_targets
1380  if (!error) {
1381  scan_known_targets(expected_known_targets);
1382  targets = known_targets+added_targets;
1383  }
1384  }
1386  delete [] known2id;
1387  }
1388  ARB_ERROR& get_error() { return error; } // needs to be checked after construction!
1389 
1390  long get_datasize() const { return datasize; }
1391  int get_count() const { return targets; }
1392  int get_known_count() const { return known_targets; }
1393  int get_added_count() const { return added_targets; }
1394 
1395  void generate(ProbeCandidates& candidates) {
1396  arb_progress progress(static_cast<long>(get_count()));
1397  for (int k = 0; k<known_targets; ++k) {
1398  const probe_input_data& pid = psg.data[known2id[k]];
1399  candidates.generate_for_sequence(pdc, pid.get_dataPtr(), pid.get_size());
1400  ++progress;
1401  }
1402  for (PT_sequence *seq = pdc->sequences; seq; seq = seq->next) {
1403  candidates.generate_for_sequence(pdc, ARB_strndup(seq->seq.data, seq->seq.size), seq->seq.size);
1404  ++progress;
1405  }
1406  }
1407 };
1408 
1409 #if defined(DUMP_DESIGNED_PROBES)
1410 static void dump_tprobe(PT_tprobes *tprobe, int idx) {
1411  char *readable = readable_probe(tprobe->sequence, tprobe->seq_len, 'T');
1412  fprintf(stderr, "tprobe='%s' idx=%i len=%i\n", readable, idx, tprobe->seq_len);
1413  free(readable);
1414 }
1415 static void DUMP_TPROBES(const char *where, PT_pdc *pdc) {
1416  int idx = 0;
1417  fprintf(stderr, "dumping tprobes %s:\n", where);
1418  for (PT_tprobes *tprobe = pdc->tprobes; tprobe; tprobe = tprobe->next) {
1419  dump_tprobe(tprobe, idx++);
1420  }
1421 }
1422 #else
1423 #define DUMP_TPROBES(a,b)
1424 #endif
1425 
1426 int PT_start_design(PT_pdc *pdc, int /* dummy */) {
1427  PT_local *locs = (PT_local*)pdc->mh.parent->parent;
1428 
1429  erase_formatter(pdc);
1430  while (pdc->tprobes) destroy_PT_tprobes(pdc->tprobes);
1431 
1432  for (PT_sequence *seq = pdc->sequences; seq; seq = seq->next) { // Convert all external sequence to internal format
1433  seq->seq.size = probe_compress_sequence(seq->seq.data, seq->seq.size-1); // no longer convert final zero-byte (PT_QU gets auto-appended)
1434  }
1435 
1436  ARB_ERROR error;
1437  int unknown_species_count = 0;
1438  {
1439  char *unknown_names = ptpd_read_names(locs, pdc->names.data, pdc->checksums.data, error); // read the names
1440  if (unknown_names) {
1442  GBT_splitNdestroy_string(names, unknown_names, "#", SPLIT_DROPEMPTY);
1443  pt_assert(!unknown_names);
1444  unknown_species_count = names.size();
1445  }
1446  }
1447 
1448  if (!error) {
1449  DesignTargets targets(pdc, locs->group_count); // here locs->group_count is amount of known marked species/genes
1450  error = targets.get_error();
1451 
1452  if (!error) {
1453  locs->group_count = targets.get_count();
1454  if (locs->group_count <= 0) {
1455  error = GBS_global_string("No %s marked - no probes designed", gene_flag ? "genes" : "species");
1456  }
1457  else if (targets.get_added_count() != unknown_species_count) {
1458  if (gene_flag) { // cannot add genes
1459  pt_assert(targets.get_added_count() == 0); // cannot, but did add?!
1460  error = GBS_global_string("Cannot design probes for %i unknown marked genes", unknown_species_count);
1461  }
1462  else {
1463  int added = targets.get_added_count();
1464  error = GBS_global_string("Got %i unknown marked species, but %i custom sequence%s added (has to match)",
1465  unknown_species_count,
1466  added,
1467  added == 1 ? " was" : "s were");
1468  }
1469  }
1470  else if (pdc->min_probelen < MIN_DESIGN_PROBE_LENGTH) {
1471  error = GBS_global_string("Specified min. probe length %i is below the min. allowed probe length of %i", pdc->min_probelen, MIN_DESIGN_PROBE_LENGTH);
1472  }
1473  else if (get_max_probelen(pdc) < pdc->min_probelen) {
1474  error = GBS_global_string("Max. probe length %i is below the specified min. probe length of %i", get_max_probelen(pdc), pdc->min_probelen);
1475  }
1476  else {
1477  // search for possible probes
1478  if (!error) {
1479  int min_probelen = pdc->min_probelen;
1480  int max_probelen = get_max_probelen(pdc);
1481 
1482  arb_progress progress("Searching probe candidates", long(max_probelen-min_probelen+1));
1483  for (int len = min_probelen; len <= max_probelen; ++len) {
1484  ProbeCandidates candidates(len);
1485 
1486  targets.generate(candidates);
1487  pt_assert(!targets.get_error());
1488 
1489  candidates.create_tprobes(pdc, targets.get_count());
1490  ++progress;
1491  }
1492  }
1493 
1494  while (pdc->sequences) destroy_PT_sequence(pdc->sequences);
1495 
1496  if (!error) {
1500  size_t tprobes = tprobes_calculate_bonds(locs);
1501  DUMP_TPROBES("after tprobes_calculate_bonds", pdc);
1502 
1503  if (tprobes) {
1504  arb_progress progress(GBS_global_string("Calculating probe quality for %zu probe candidates", tprobes), tprobes);
1505  OutgroupMatcher om(locs, pdc);
1506  for (PT_tprobes *tprobe = pdc->tprobes; tprobe; tprobe = tprobe->next) {
1507  om.calculate_outgroup_matches(*tprobe);
1508  ++progress;
1509  }
1510  }
1511  else {
1512  fputs("No probe candidates found\n", stdout);
1513  }
1514 
1517  clip_tprobes(pdc, pdc->clipresult);
1518  }
1519  }
1520  }
1521  }
1522 
1523  pt_export_error_if(locs, error);
1524  return 0;
1525 }
1526 
1527 // --------------------------------------------------------------------------------
1528 
1529 #ifdef UNIT_TESTS
1530 #ifndef TEST_UNIT_H
1531 #include <test_unit.h>
1532 #endif
1533 
1534 static const char *concat_iteration(PrefixIterator& prefix) {
1535  static GBS_strstruct out(50);
1536 
1537  out.erase();
1538 
1539  while (!prefix.done()) {
1540  if (out.filled()) out.put(',');
1541 
1542  char *readable = probe_2_readable(prefix.copy(), prefix.length());
1543  out.cat(readable);
1544  free(readable);
1545 
1546  ++prefix;
1547  }
1548 
1549  return out.get_data();
1550 }
1551 
1552 void TEST_PrefixIterator() {
1553  // straight-forward permutation
1554  PrefixIterator p0(PT_A, PT_T, 0); TEST_EXPECT_EQUAL(p0.steps(), 1);
1555  PrefixIterator p1(PT_A, PT_T, 1); TEST_EXPECT_EQUAL(p1.steps(), 4);
1556  PrefixIterator p2(PT_A, PT_T, 2); TEST_EXPECT_EQUAL(p2.steps(), 16);
1557  PrefixIterator p3(PT_A, PT_T, 3); TEST_EXPECT_EQUAL(p3.steps(), 64);
1558 
1559  TEST_EXPECT_EQUAL(p1.done(), false);
1560  TEST_EXPECT_EQUAL(p0.done(), false);
1561 
1562  TEST_EXPECT_EQUAL(concat_iteration(p0), "");
1563  TEST_EXPECT_EQUAL(concat_iteration(p1), "A,C,G,U");
1564  TEST_EXPECT_EQUAL(concat_iteration(p2), "AA,AC,AG,AU,CA,CC,CG,CU,GA,GC,GG,GU,UA,UC,UG,UU");
1565 
1566  // permutation truncated at PT_QU
1567  PrefixIterator q0(PT_QU, PT_T, 0); TEST_EXPECT_EQUAL(q0.steps(), 1);
1568  PrefixIterator q1(PT_QU, PT_T, 1); TEST_EXPECT_EQUAL(q1.steps(), 6);
1569  PrefixIterator q2(PT_QU, PT_T, 2); TEST_EXPECT_EQUAL(q2.steps(), 31);
1570  PrefixIterator q3(PT_QU, PT_T, 3); TEST_EXPECT_EQUAL(q3.steps(), 156);
1571  PrefixIterator q4(PT_QU, PT_T, 4); TEST_EXPECT_EQUAL(q4.steps(), 781);
1572 
1573  TEST_EXPECT_EQUAL(concat_iteration(q0), "");
1574  TEST_EXPECT_EQUAL(concat_iteration(q1), ".,N,A,C,G,U");
1575  TEST_EXPECT_EQUAL(concat_iteration(q2),
1576  ".,"
1577  "N.,NN,NA,NC,NG,NU,"
1578  "A.,AN,AA,AC,AG,AU,"
1579  "C.,CN,CA,CC,CG,CU,"
1580  "G.,GN,GA,GC,GG,GU,"
1581  "U.,UN,UA,UC,UG,UU");
1582  TEST_EXPECT_EQUAL(concat_iteration(q3),
1583  ".,"
1584  "N.,"
1585  "NN.,NNN,NNA,NNC,NNG,NNU,"
1586  "NA.,NAN,NAA,NAC,NAG,NAU,"
1587  "NC.,NCN,NCA,NCC,NCG,NCU,"
1588  "NG.,NGN,NGA,NGC,NGG,NGU,"
1589  "NU.,NUN,NUA,NUC,NUG,NUU,"
1590  "A.,"
1591  "AN.,ANN,ANA,ANC,ANG,ANU,"
1592  "AA.,AAN,AAA,AAC,AAG,AAU,"
1593  "AC.,ACN,ACA,ACC,ACG,ACU,"
1594  "AG.,AGN,AGA,AGC,AGG,AGU,"
1595  "AU.,AUN,AUA,AUC,AUG,AUU,"
1596  "C.,"
1597  "CN.,CNN,CNA,CNC,CNG,CNU,"
1598  "CA.,CAN,CAA,CAC,CAG,CAU,"
1599  "CC.,CCN,CCA,CCC,CCG,CCU,"
1600  "CG.,CGN,CGA,CGC,CGG,CGU,"
1601  "CU.,CUN,CUA,CUC,CUG,CUU,"
1602  "G.,"
1603  "GN.,GNN,GNA,GNC,GNG,GNU,"
1604  "GA.,GAN,GAA,GAC,GAG,GAU,"
1605  "GC.,GCN,GCA,GCC,GCG,GCU,"
1606  "GG.,GGN,GGA,GGC,GGG,GGU,"
1607  "GU.,GUN,GUA,GUC,GUG,GUU,"
1608  "U.,"
1609  "UN.,UNN,UNA,UNC,UNG,UNU,"
1610  "UA.,UAN,UAA,UAC,UAG,UAU,"
1611  "UC.,UCN,UCA,UCC,UCG,UCU,"
1612  "UG.,UGN,UGA,UGC,UGG,UGU,"
1613  "UU.,UUN,UUA,UUC,UUG,UUU");
1614 }
1615 
1616 #endif // UNIT_TESTS
1617 
1618 // --------------------------------------------------------------------------------
struct probe_input_data * data
Definition: probe.h:356
const char * aisc_unlink(dllheader_ext *object)
Definition: struct_man.c:185
long get_datasize() const
void calculate_outgroup_matches(PT_tprobes &tprobe)
const char * id
Definition: AliAdmin.cxx:17
group_matcher all()
Definition: test_unit.h:1011
Definition: probe.h:83
SmartCharPtr get_dataPtr() const
Definition: probe.h:165
size_t size() const
Definition: arb_strarray.h:85
bool less(const ProbeOccurrence &other, size_t len) const
void fflush_all()
Definition: PT_tools.h:211
static PD_Formatter4design format4design
char * copy() const
Definition: PT_prefixIter.h:92
void complement_probe(char *Probe, int len)
Definition: PT_complement.h:35
static int count_mishits_for_matched(char *probe, POS_TREE2 *pt, int height)
bool domainPossible() const
DesignTargets(PT_pdc *pdc_, int expected_known_targets)
void dump(FILE *out) const
std::map< const PT_pdc *const, PD_formatter > PD_Formatter4design
Definition: probe.h:84
void pt_export_error_if(PT_local *locs, ARB_ERROR &error)
Definition: PT_etc.cxx:22
void generate(ProbeCandidates &candidates)
void GB_sort(void **array, size_t first, size_t behind_last, gb_compare_function compare, void *client_data)
Definition: arb_sort.cxx:27
bool centigrade_pos_out_of_reach(const PT_tprobes *tprobe, const PT_pdc *const pdc, const Splits &splits) const
int PT_forwhole_chain(PT *node, T &func)
int get_name() const
Definition: probe_tree.h:434
static void clip_tprobes(PT_pdc *pdc, int count)
bool operator()(const SmartCharPtr &p1, const SmartCharPtr &p2)
PO_Less(size_t probelen_)
const probe_input_data & get_pid() const
Definition: probe_tree.h:519
probe_struct_global psg
Definition: PT_main.cxx:36
int domainLength() const
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
Definition: probe.h:85
const ProbeOccurrence & occurrence() const
MatchingOligo dont_bind_rest() const
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
int get_size() const
Definition: probe.h:211
static bool is_valid_pos(int pos)
Oligo suffix(int skip) const
bool feasible(PT_pdc *pdc) const
bool domainSeen() const
int operator()(const AbsLoc &probeLoc)
int PT_start_design(PT_pdc *pdc, int)
size_t length() const
Definition: PT_prefixIter.h:89
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
void summarize_centigrade_hits(int *centigrade_hits) const
FILE * seq
Definition: rns.c:46
int calc_centigrade_pos(const PT_tprobes *tprobe, const PT_pdc *const pdc) const
double get_max_bond(int base) const
Definition: pt_split.h:42
int get_added_count() const
static void remove_tprobes_outside_ecoli_range(PT_pdc *pdc)
static FullNameMap names
const probe_input_data & get_pid() const
Definition: probe_tree.h:437
bool outside_group() const
Definition: probe.h:218
Oligo(const char *data_, int length_)
ProbeIterator(const char *seq, size_t seqlen, size_t probelen_)
MatchingOligo(const Oligo &oligo_)
int get_ecol_width() const
static size_t tprobes_calculate_bonds(PT_local *locs)
static int count_mishits_for_all(POS_TREE2 *pt)
GB_ERROR deliver() const
Definition: arb_error.h:116
int get_abs_pos() const
Definition: probe_tree.h:435
void aisc_link(dll_public *dll, PT_tprobes *tprobe)
size_t probe_compress_sequence(char *seq, size_t seqsize)
Definition: PT_io.cxx:78
bool more(const copy< T > &t1, const copy< T > &t2)
Definition: test_unit.h:646
CONSTEXPR_INLINE int digits(int parts)
void announce(int p)
Definition: probe.h:328
static void tprobes_sumup_perc_and_calc_quality(PT_pdc *pdc)
GB_BUFFER GB_give_buffer(size_t size)
Definition: arbdb.cxx:311
double sum_bonds
char * readable_probe(const char *compressed_probe, size_t len, char T_or_U)
Definition: PT_io.cxx:91
#define false
Definition: ureadseq.h:13
int sprint_centigrade_list(char *buffer, const int *perc) const
static void remove_tprobes_with_too_many_mishits(PT_pdc *pdc)
static void error(const char *msg)
Definition: mkptypes.cxx:96
char * get_design_hinfo(const PT_pdc *pdc)
const char * get_shortname() const
Definition: probe.h:170
long PT_abs_2_ecoli_rel(long pos)
Definition: PT_io.cxx:347
int get_most_used() const
Definition: probe.h:338
ProbeOccurrence(const char *seq_)
int get_known_count() const
const Oligo & get_oligo() const
ASSERTING_CONSTEXPR_INLINE int info2bio(int infopos)
Definition: arb_defs.h:27
static void sort_tprobes_by(PT_pdc *pdc, ProbeSortMode mode)
int get_apos_width() const
const AbsLoc & at() const
Definition: probe_tree.h:685
bool is_leaf() const
Definition: probe_tree.h:249
#define cmp(h1, h2)
Definition: admap.cxx:50
TYPE get_type() const
Definition: probe_tree.h:246
Definition: probe.h:86
CONSTEXPR_INLINE bool is_std_base_or_N(char b)
Definition: probe.h:94
int get_max_probelen(const PT_pdc *pdc)
int shown_apos(const PT_tprobes *tprobe)
Definition: probe.h:88
bool gc_in_wanted_range(PT_pdc *pdc) const
#define pt_assert(bed)
Definition: PT_tools.h:22
bool inside_group() const
Definition: probe.h:217
void generate_for_sequence(PT_pdc *pdc, const SmartCharPtr &seq, size_t bp)
OutgroupMatcher(PT_local *locs_, PT_pdc *pdc_)
int get_count() const
bool empty() const
PT * PT_read_son(PT *node, PT_base base)
static char hitgroup_idx2char(int idx)
bool is_node() const
Definition: probe_tree.h:248
ptnd_chain_count_mishits(const char *probe_, int height_)
char at(int offset) const
fputs(TRACE_PREFIX, stderr)
PT_base
Definition: probe.h:82
int shown_qual(const PT_tprobes *tprobe)
int size() const
int get_temperature() const
T_PT_LOCS locs
dt_bondssum(double dt_, double sum_bonds_)
str readable(const copy< T > &v)
Definition: test_unit.h:345
char * ARB_strndup(const char *start, int len)
Definition: arb_string.h:83
int gene_flag
Definition: PT_main.cxx:39
void set_pos_for(int name, int pos)
#define MIN_DESIGN_PROBE_LENGTH
const size_t BUFFERSIZE
PD_formatter(const PT_pdc *pdc)
GB_BUFFER GB_give_buffer2(long size)
Definition: arbdb.cxx:334
ProbeCandidates(size_t probelen_)
void erase_formatter(const PT_pdc *pdc)
POS_TREE2 *& TREE_ROOT2()
Definition: probe.h:390
char * probe_2_readable(char *id_string, int len)
Definition: probe.h:102
#define abs(x)
Definition: f2c.h:151
Definition: probe.h:89
void create_tprobes(PT_pdc *pdc, int ingroup_size)
#define DUMP_TPROBES(a, b)
int get_rel_pos() const
Definition: probe_tree.h:471
double check(char base, char ref) const
Definition: pt_split.h:114
#define NULp
Definition: cxxforward.h:116
bool done() const
ARB_ERROR & get_error()
static int ptnd_compare_sequence(const void *vtp1, const void *vtp2, void *)
#define offset(field)
Definition: GLwDrawA.c:73
int get_max_designed_len() const
char * ptpd_read_names(PT_local *locs, const char *names_list, const char *checksums, ARB_ERROR &error)
Definition: PT_etc.cxx:120
char dangling_char() const
static int ptnd_compare_quality(const void *vtp1, const void *vtp2, void *)
static int pt_get_gc_content(char *probe)
void clear()
Definition: probe.h:321
int dangling() const
const char * sequence() const
int pt_init_bond_matrix(PT_local *THIS)
MostUsedPos abs_pos
Definition: probe.h:368
char * create_reversed_probe(char *probe, int len)
Definition: PT_match.cxx:335
bool is_chain() const
Definition: probe_tree.h:250
int get_qual_width() const
#define min(a, b)
Definition: f2c.h:153
CONSTEXPR_INLINE int double_cmp(const double d1, const double d2)
Definition: arbtools.h:185
void GBT_splitNdestroy_string(ConstStrArray &names, char *&namelist, const char *separator, SplitMode mode)
bool temperature_in_wanted_range(PT_pdc *pdc) const
bool completely_bound() const
char * get_design_info(const PT_tprobes *tprobe)
bool operator()(const ProbeOccurrence &a, const ProbeOccurrence &b) const
int get_grps_width() const
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1294
ProbeSortMode
static double pt_get_temperature(const char *probe)
Definition: probe.h:87
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
MatchingOligo bind_against(char c, const Splits &splits) const
#define UNCOVERED()
Definition: arb_assert.h:380
const PD_formatter & get_formatter(const PT_pdc *pdc)
int shown_ecoli(const PT_tprobes *tprobe)
GB_write_int const char s
Definition: AW_awar.cxx:154
#define max(a, b)
Definition: f2c.h:154