ARB
PT_match.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : PT_match.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "probe.h"
12 #include <PT_server_prototypes.h>
13 #include <struct_man.h>
14 
15 #include "pt_split.h"
16 #include "probe_tree.h"
17 
18 #include <arb_strbuf.h>
19 #include <arb_defs.h>
20 #include <arb_sort.h>
21 #include <cctype>
22 #include <map>
23 
24 // overloaded functions to avoid problems with type-punning:
25 inline void aisc_link(dll_public *dll, PT_probematch *match) { aisc_link(reinterpret_cast<dllpublic_ext*>(dll), reinterpret_cast<dllheader_ext*>(match)); }
26 
27 class MatchRequest;
28 class Mismatches {
29  MatchRequest& req;
30 
31  int plain; // plain mismatch between 2 standard bases
32  int ambig; // mismatch with N or '.' involved
33 
34  double weighted; // weighted mismatches
35 
36 public:
37 
38  Mismatches(MatchRequest& req_) : req(req_), plain(0), ambig(0), weighted(0.0) {}
39 
40  inline void count_weighted(char probe, char seq, int height);
41  void count_versus(const ReadableDataLoc& loc, const char *probe, int height);
42 
43  inline bool accepted() const;
44 
45  int get_plain() const { return plain; }
46  int get_ambig() const { return ambig; }
47 
48  double get_weighted() const { return weighted; }
49 
50  inline PT_local& get_PT_local() const;
51 };
52 
53 class MatchRequest : virtual Noncopyable {
54  PT_local& pt_local;
55 
56  int max_ambig; // max. possible ambiguous hits (i.e. max value in Mismatches::ambig)
57  int *accepted_N_mismatches;
58 
59  MismatchWeights weights;
60 
61  void init_accepted_N_mismatches(int ignored_Nmismatches, int when_less_than_Nmismatches);
62 
63 public:
64  explicit MatchRequest(PT_local& locs, int probe_length)
65  : pt_local(locs),
66  max_ambig(probe_length),
67  accepted_N_mismatches(new int[max_ambig+1]),
68  weights(locs.bond)
69  {
70  init_accepted_N_mismatches(pt_local.pm_nmatches_ignored, pt_local.pm_nmatches_limit);
71  }
73  delete [] accepted_N_mismatches;
74  }
75 
76  PT_local& get_PT_local() const { return pt_local; }
77 
78  bool hit_limit_reached() const {
79  bool reached = pt_local.pm_max_hits>0 && pt_local.ppm.cnt >= pt_local.pm_max_hits;
80  if (reached) pt_local.matches_truncated = 1;
81  return reached;
82  }
83 
84  int accept_N_mismatches(int ambig) const {
85  pt_assert(ambig<=max_ambig);
86  return accepted_N_mismatches[ambig];
87  }
88 
89  bool add_hit(const DataLoc& at, const Mismatches& mismatch);
90  bool add_hits_for_children(POS_TREE2 *pt, const Mismatches& mismatch);
91  bool collect_hits_for(const char *probe, POS_TREE2 *pt, Mismatches& mismatch, int height);
92 
93  int allowed_mismatches() const { return pt_local.pm_max; }
94  double get_mismatch_weight(char probe, char seq) const { return weights.get(probe, seq); }
95 };
96 
97 
98 
99 void MatchRequest::init_accepted_N_mismatches(int ignored_Nmismatches, int when_less_than_Nmismatches) {
100  // calculate table for PT_N mismatches
101  //
102  // 'ignored_Nmismatches' specifies, how many N-mismatches will be accepted as
103  // matches, when overall number of N-mismatches is below 'when_less_than_Nmismatches'.
104  //
105  // above that limit, every N-mismatch counts as mismatch
106 
107  when_less_than_Nmismatches = std::min(when_less_than_Nmismatches, max_ambig+1);
108  ignored_Nmismatches = std::min(ignored_Nmismatches, when_less_than_Nmismatches-1);
109 
110  accepted_N_mismatches[0] = 0;
111  int mm;
112  for (mm = 1; mm<when_less_than_Nmismatches; ++mm) { // LOOP_VECTORIZED=* (1 loop prior; 2 loops reported with 7.x)
113  accepted_N_mismatches[mm] = mm>ignored_Nmismatches ? mm-ignored_Nmismatches : 0;
114  }
115  pt_assert(mm <= (max_ambig+1));
116  for (; mm <= max_ambig; ++mm) {
117  accepted_N_mismatches[mm] = mm;
118  }
119 }
120 
121 inline void Mismatches::count_weighted(char probe, char seq, int height) {
122  bool is_ambig = is_ambig_base(probe) || is_ambig_base(seq);
123  if (is_ambig || probe != seq) {
124  if (is_ambig) ambig++; else plain++;
125  weighted += req.get_mismatch_weight(probe, seq) * psg.pos_to_weight[height];
126  }
127 }
128 
129 inline bool Mismatches::accepted() const {
130  if (get_PT_local().sort_by == PT_MATCH_TYPE_INTEGER) {
131  return (req.accept_N_mismatches(ambig)+plain) <= req.allowed_mismatches();
132  }
133  return weighted <= (req.allowed_mismatches()+0.5);
134 }
135 
136 inline PT_local& Mismatches::get_PT_local() const {
137  return req.get_PT_local();
138 }
139 
140 bool MatchRequest::add_hit(const DataLoc& at, const Mismatches& mismatch) {
141  PT_probematch *ml = create_PT_probematch();
142 
143  ml->name = at.get_name();
144  ml->b_pos = at.get_abs_pos();
145  ml->g_pos = -1;
146  ml->rpos = at.get_rel_pos();
147 
148  ml->mismatches = mismatch.get_plain() + accept_N_mismatches(mismatch.get_ambig());
149  ml->wmismatches = mismatch.get_weighted();
150  ml->N_mismatches = mismatch.get_ambig();
151 
152  ml->sequence = psg.main_probe;
153  ml->reversed = psg.reversed ? 1 : 0;
154 
155  aisc_link(&get_PT_local().ppm, ml);
156 
157  return hit_limit_reached();
158 }
159 
162 
163  pt_assert(pt && mismatch.accepted()); // invalid or superfluous call
165 
166  bool enough = false;
167  switch (pt->get_type()) {
168  case PT2_LEAF:
169  enough = add_hit(DataLoc(pt), mismatch);
170  break;
171 
172  case PT2_CHAIN: {
173  ChainIteratorStage2 entry(pt);
174  while (entry && !enough) {
175  enough = add_hit(DataLoc(entry.at()), mismatch);
176  ++entry;
177  }
178  break;
179  }
180  case PT2_NODE:
181  for (int base = PT_QU; base < PT_BASES && !enough; base++) {
182  POS_TREE2 *son = PT_read_son(pt, (PT_base)base);
183  if (son) enough = add_hits_for_children(son, mismatch);
184  }
185  break;
186  }
187  return enough;
188 }
189 
190 void Mismatches::count_versus(const ReadableDataLoc& loc, const char *probe, int height) {
191  int base;
192  while ((base = probe[height])) {
193  int ref = loc[height];
194  if (ref == PT_QU) break;
195 
196  count_weighted(base, ref, height);
197  height++;
198  }
199 
200  if (base != PT_QU) { // not end of probe
201  pt_assert(loc[height] == PT_QU); // at EOS
202  do {
203  count_weighted(base, PT_QU, height);
204  height++;
205  }
206  while ((base = probe[height]));
207  }
208 }
209 
210 bool MatchRequest::collect_hits_for(const char *probe, POS_TREE2 *pt, Mismatches& mismatches, const int height) {
212 
213  pt_assert(pt && mismatches.accepted()); // invalid or superfluous call
215 
216  bool enough = false;
217  if (probe[height] == PT_QU) {
218  enough = add_hits_for_children(pt, mismatches);
219  }
220  else {
221  switch (pt->get_type()) {
222  case PT2_LEAF: {
223  ReadableDataLoc loc(pt);
224  mismatches.count_versus(loc, probe, height);
225  if (mismatches.accepted()) {
226  enough = add_hit(loc, mismatches);
227  }
228  break;
229  }
230  case PT2_CHAIN: {
231  pt_assert(probe);
232 
233  ChainIteratorStage2 entry(pt);
234  while (entry && !enough) {
235  Mismatches entry_mismatches(mismatches);
236  DataLoc dloc(entry.at()); // @@@ EXPENSIVE_CONVERSION
237  entry_mismatches.count_versus(ReadableDataLoc(dloc), probe, height); // @@@ EXPENSIVE_CONVERSION
238  if (entry_mismatches.accepted()) {
239  enough = add_hit(dloc, entry_mismatches);
240  }
241  ++entry;
242  }
243  break;
244  }
245  case PT2_NODE:
246  for (int i=PT_QU; i<PT_BASES && !enough; i++) {
247  POS_TREE2 *son = PT_read_son(pt, (PT_base)i);
248  if (son) {
249  Mismatches son_mismatches(mismatches);
250  son_mismatches.count_weighted(probe[height], i, height);
251  if (son_mismatches.accepted()) {
252  if (i == PT_QU) {
253  // @@@ calculation here is constant for a fixed probe (cache results)
254  pt_assert(probe[height] != PT_QU);
255 
256  int son_height = height+1;
257  while (1) {
258  int base = probe[son_height];
259  if (base == PT_QU) {
260  if (son_mismatches.accepted()) {
261  enough = add_hits_for_children(son, son_mismatches);
262  }
263  break;
264  }
265 
266  son_mismatches.count_weighted(base, PT_QU, son_height);
267  if (!son_mismatches.accepted()) break;
268 
269  ++son_height;
270  }
271  }
272  else {
273  enough = collect_hits_for(probe, son, son_mismatches, height+1);
274  }
275  }
276  }
277  }
278  break;
279  }
280  }
281 
282  return enough;
283 }
284 
285 static int pt_sort_compare_match(const void *PT_probematch_ptr1, const void *PT_probematch_ptr2, void *) {
286  const PT_probematch *mach1 = (const PT_probematch*)PT_probematch_ptr1;
287  const PT_probematch *mach2 = (const PT_probematch*)PT_probematch_ptr2;
288 
290  if (mach1->wmismatches > mach2->wmismatches) return 1;
291  if (mach1->wmismatches < mach2->wmismatches) return -1;
292  }
293 
294  int cmp = mach1->mismatches - mach2->mismatches;
295  if (!cmp) {
296  cmp = mach1->N_mismatches - mach2->N_mismatches;
297  if (!cmp) {
298  if (mach1->wmismatches < mach2->wmismatches) cmp = -1;
299  else if (mach1->wmismatches > mach2->wmismatches) cmp = 1;
300  else {
301  cmp = mach1->b_pos - mach2->b_pos;
302  if (!cmp) {
303  cmp = mach1->name - mach2->name;
304  }
305  }
306  }
307  }
308 
309  return cmp;
310 }
311 
312 static void pt_sort_match_list(PT_local * locs) {
313  if (locs->pm) {
314  psg.sort_by = locs->sort_by;
315 
316  int list_len = locs->pm->get_count();
317  if (list_len > 1) {
318  PT_probematch **my_list; ARB_calloc(my_list, list_len);
319  {
320  PT_probematch *match = locs->pm;
321  for (int i=0; match; i++) {
322  my_list[i] = match;
323  match = match->next;
324  }
325  }
326  GB_sort((void **)my_list, 0, list_len, pt_sort_compare_match, NULp);
327  for (int i=0; i<list_len; i++) {
328  aisc_unlink((dllheader_ext*)my_list[i]);
329  aisc_link(&locs->ppm, my_list[i]);
330  }
331  free(my_list);
332  }
333  }
334 }
335 char *create_reversed_probe(char *probe, int len) {
337  char *rev_probe = ARB_strduplen(probe, len);
338  reverse_probe(rev_probe, len);
339  return rev_probe;
340 }
341 
342 CONSTEXPR_INLINE double calc_position_wmis(int pos, int seq_len, double y1, double y2) {
343  return (double)(((double)(pos * (seq_len - 1 - pos)) / (double)((seq_len - 1) * (seq_len - 1)))* (double)(y2*4.0) + y1);
344 }
345 
346 static void pt_build_pos_to_weight(PT_MATCH_TYPE type, const char *sequence) {
347  delete [] psg.pos_to_weight;
348  int slen = strlen(sequence);
349  psg.pos_to_weight = new double[slen+1];
350  int p;
351  for (p=0; p<slen; p++) { // LOOP_VECTORIZED=* (no idea why this is instantiated 4 times for gcc<8.1. inline would/does cause 2 instantiations, as encountered with gcc 8.1)
352  if (type == PT_MATCH_TYPE_WEIGHTED_PLUS_POS) {
353  psg.pos_to_weight[p] = calc_position_wmis(p, slen, 0.3, 1.0);
354  }
355  else {
356  psg.pos_to_weight[p] = 1.0;
357  }
358  }
359  psg.pos_to_weight[slen] = 0;
360 }
361 
362 static std::map<PT_local*,Splits> splits_for_match_overlay; // initialized by probe-match, used by match-retrieval (one entry for each match-request); @@@ leaks.. 1 entry for each request
363 
364 int probe_match(PT_local *locs, aisc_string probestring) {
366 
367  freedup(locs->pm_sequence, probestring);
368  psg.main_probe = locs->pm_sequence;
369 
370  compress_data(probestring);
371  while (PT_probematch *ml = locs->pm) destroy_PT_probematch(ml);
372  locs->matches_truncated = 0;
373 
374 #if defined(DEBUG) && 0
375  printf("Current bond values:\n");
376  for (int y = 0; y<4; y++) {
377  for (int x = 0; x<4; x++) {
378  printf("%5.2f", locs->bond[y*4+x].val);
379  }
380  printf("\n");
381  }
382 #endif // DEBUG
383 
384  int probe_len = strlen(probestring);
385  bool failed = false;
386  if (probe_len<MIN_PROBE_LENGTH) {
387  pt_export_error(locs, GBS_global_string("Min. probe length is %i", MIN_PROBE_LENGTH));
388  failed = true;
389  }
390  else {
391  int max_poss_mismatches = probe_len/2;
392  pt_assert(max_poss_mismatches>0);
393  if (locs->pm_max > max_poss_mismatches) {
394  pt_export_error(locs, GBS_global_string("Max. %i mismatch%s are allowed for probes of length %i",
395  max_poss_mismatches,
396  max_poss_mismatches == 1 ? "" : "es",
397  probe_len));
398  failed = true;
399  }
400  }
401 
402  if (!failed) {
403  if (locs->pm_complement_first) {
404  complement_probe(probestring, probe_len);
405  }
406  psg.reversed = 0;
407 
408  freedup(locs->pm_sequence, probestring);
409  psg.main_probe = locs->pm_sequence;
410 
411  pt_build_pos_to_weight((PT_MATCH_TYPE)locs->sort_by, probestring);
412 
413  MatchRequest req(*locs, probe_len);
414 
415  pt_assert(req.allowed_mismatches() >= 0); // till [8011] value<0 was used to trigger "new match" (feature unused)
416  Mismatches mismatch(req);
417  req.collect_hits_for(probestring, psg.TREE_ROOT2(), mismatch, 0);
418 
419  if (locs->pm_also_revcomp) {
420  psg.reversed = 1;
421  char *rev_pro = create_reversed_probe(probestring, probe_len);
422  complement_probe(rev_pro, probe_len);
423  freeset(locs->pm_csequence, psg.main_probe = ARB_strdup(rev_pro));
424 
425  Mismatches rev_mismatch(req);
426  req.collect_hits_for(rev_pro, psg.TREE_ROOT2(), rev_mismatch, 0);
427  free(rev_pro);
428  }
429  pt_sort_match_list(locs);
430  splits_for_match_overlay[locs] = Splits(locs);
431  }
432  free(probestring);
433 
434  return 0;
435 }
436 
437 struct format_props {
438  bool show_mismatches; // whether to show 'mis' and 'N_mis'
439  bool show_ecoli; // whether to show 'ecoli' column
440  bool show_gpos; // whether to show 'gpos' column
441 
442  int name_width; // width of 'name' column
443  int gene_or_full_width; // width of 'genename' or 'fullname' column
444  int pos_width; // max. width of pos column
445  int gpos_width; // max. width of gpos column
446  int ecoli_width; // max. width of ecoli column
447 
448  int rev_width() const { return 3; }
449  int mis_width() const { return 3; }
450  int N_mis_width() const { return 5; }
451  int wmis_width() const { return 4; }
452 };
453 
454 inline void set_max(const char *str, int &curr_max) {
455  if (str) {
456  int len = strlen(str);
457  if (len>curr_max) curr_max = len;
458  }
459 }
460 
461 static format_props detect_format_props(const PT_local *locs, bool show_gpos) {
462  PT_probematch *ml = locs->pm; // probe matches
464 
465  format.show_mismatches = (ml->N_mismatches >= 0);
466  format.show_ecoli = psg.ecoli; // display only if there is ecoli
467  format.show_gpos = show_gpos; // display only for gene probe matches
468 
469  // minimum values (caused by header widths) :
470  format.name_width = gene_flag ? 8 : 4; // 'organism' or 'name'
471  format.gene_or_full_width = 8; // 'genename' or 'fullname'
472  format.pos_width = 3; // 'pos'
473  format.gpos_width = 4; // 'gpos'
474  format.ecoli_width = 5; // 'ecoli'
475 
476  for (; ml; ml = ml->next) {
477  set_max(virt_name(ml), format.name_width);
479  set_max(GBS_global_string("%i", info2bio(ml->b_pos)), format.pos_width);
480  if (show_gpos) set_max(GBS_global_string("%i", info2bio(ml->g_pos)), format.gpos_width);
481  if (format.show_ecoli) set_max(GBS_global_string("%li", PT_abs_2_ecoli_rel(ml->b_pos+1)), format.ecoli_width);
482  }
483 
484  return format;
485 }
486 
487 inline void cat_internal(GBS_strstruct *memfile, int len, const char *text, int width, char spacer, bool align_left) {
488  if (len == width) {
489  GBS_strcat(memfile, text); // text has exact len
490  }
491  else if (len > width) { // text to long
492  char buf[width+1];
493  memcpy(buf, text, width);
494  buf[width] = 0;
495  GBS_strcat(memfile, buf);
496  }
497  else { // text is too short -> insert spaces
498  int spaces = width-len;
499  pt_assert(spaces>0);
500  char sp[spaces+1];
501  memset(sp, spacer, spaces);
502  sp[spaces] = 0;
503 
504  if (align_left) {
505  GBS_strcat(memfile, text);
506  GBS_strcat(memfile, sp);
507  }
508  else {
509  GBS_strcat(memfile, sp);
510  GBS_strcat(memfile, text);
511  }
512  }
513  GBS_chrcat(memfile, ' '); // one space behind each column
514 }
515 inline void cat_spaced_left (GBS_strstruct *memfile, const char *text, int width) { cat_internal(memfile, strlen(text), text, width, ' ', true); }
516 inline void cat_spaced_right(GBS_strstruct *memfile, const char *text, int width) { cat_internal(memfile, strlen(text), text, width, ' ', false); }
517 inline void cat_dashed_left (GBS_strstruct *memfile, const char *text, int width) { cat_internal(memfile, strlen(text), text, width, '-', true); }
518 inline void cat_dashed_right(GBS_strstruct *memfile, const char *text, int width) { cat_internal(memfile, strlen(text), text, width, '-', false); }
519 
520 const char *get_match_overlay(const PT_probematch *ml) {
521  int pr_len = strlen(ml->sequence);
522  PT_local *locs = (PT_local *)ml->mh.parent->parent;
523 
524  const int CONTEXT_SIZE = 9;
525 
526  char *ref = ARB_calloc<char>(CONTEXT_SIZE+1+pr_len+1+CONTEXT_SIZE+1);
527  memset(ref, '.', CONTEXT_SIZE+1);
528 
529  SmartCharPtr seqPtr = psg.data[ml->name].get_dataPtr();
530  const char *seq = &*seqPtr;
531 
532  const Splits& splits = splits_for_match_overlay[locs];
533 
534  for (int pr_pos = CONTEXT_SIZE-1, al_pos = ml->rpos-1;
535  pr_pos >= 0 && al_pos >= 0;
536  pr_pos--, al_pos--)
537  {
538  if (!seq[al_pos]) break;
539  ref[pr_pos] = base_2_readable(seq[al_pos]);
540  }
541  ref[CONTEXT_SIZE] = '-';
542 
543  pt_build_pos_to_weight((PT_MATCH_TYPE)locs->sort_by, ml->sequence);
544 
545  bool display_right_context = true;
546  {
547  char *pref = ref+CONTEXT_SIZE+1;
548 
549  for (int pr_pos = 0, al_pos = ml->rpos;
550  pr_pos < pr_len && al_pos < psg.data[ml->name].get_size();
551  pr_pos++, al_pos++)
552  {
553  int ps = ml->sequence[pr_pos]; // probe sequence
554  int ts = seq[al_pos]; // target sequence (hit)
555  if (ps == ts) {
556  pref[pr_pos] = '=';
557  }
558  else {
559  if (ts) {
560  int r = base_2_readable(ts);
561  if (is_std_base(ps) && is_std_base(ts)) {
562  double h = splits.check(ml->sequence[pr_pos], ts);
563  if (h>=0.0) r = tolower(r); // if mismatch does not split probe into two domains -> show as lowercase
564  }
565  pref[pr_pos] = r;
566  }
567  else {
568  // end of sequence or missing data (dots inside sequence) reached
569  // (rest of probe was accepted by N-matches)
570  display_right_context = false;
571  for (; pr_pos < pr_len; pr_pos++) { // LOOP_VECTORIZED[!<7,!>=910<=930] // only worked with 8.x series
572  pref[pr_pos] = '.';
573  }
574  }
575  }
576 
577  }
578  }
579 
580  {
581  char *cref = ref+CONTEXT_SIZE+1+pr_len+1;
582  cref[-1] = '-';
583 
584  int al_size = psg.data[ml->name].get_size();
585  int al_pos = ml->rpos+pr_len;
586 
587  if (display_right_context) {
588  for (int pr_pos = 0;
589  pr_pos < CONTEXT_SIZE && al_pos < al_size;
590  pr_pos++, al_pos++)
591  {
592  cref[pr_pos] = base_2_readable(seq[al_pos]);
593  }
594  }
595  else {
596  if (al_pos < al_size) strcpy(cref, "<more>");
597  }
598  }
599 
600  static char *result = NULp;
601  freeset(result, ref);
602  return result;
603 }
604 
605 const char* get_match_acc(const PT_probematch *ml) {
606  return psg.data[ml->name].get_acc();
607 }
608 int get_match_start(const PT_probematch *ml) {
609  return psg.data[ml->name].get_start();
610 }
611 int get_match_stop(const PT_probematch *ml) {
612  return psg.data[ml->name].get_stop();
613 }
614 
615 static const char *get_match_info_formatted(PT_probematch *ml, const format_props& format) {
616  GBS_strstruct *memfile = GBS_stropen(256);
617  GBS_strcat(memfile, " ");
618 
619  cat_spaced_left(memfile, virt_name(ml), format.name_width);
620  cat_spaced_left(memfile, virt_fullname(ml), format.gene_or_full_width);
621 
622  if (format.show_mismatches) {
623  cat_spaced_right(memfile, GBS_global_string("%i", ml->mismatches), format.mis_width());
624  cat_spaced_right(memfile, GBS_global_string("%i", ml->N_mismatches), format.N_mis_width());
625  }
626  cat_spaced_right(memfile, GBS_global_string("%.1f", ml->wmismatches), format.wmis_width());
627  cat_spaced_right(memfile, GBS_global_string("%i", info2bio(ml->b_pos)), format.pos_width);
628  if (format.show_gpos) {
629  cat_spaced_right(memfile, GBS_global_string("%i", info2bio(ml->g_pos)), format.gpos_width);
630  }
631  if (format.show_ecoli) {
632  cat_spaced_right(memfile, GBS_global_string("%li", PT_abs_2_ecoli_rel(ml->b_pos+1)), format.ecoli_width);
633  }
634  cat_spaced_left(memfile, GBS_global_string("%i", ml->reversed), format.rev_width());
635 
636  GBS_strcat(memfile, get_match_overlay(ml));
637 
638  static char *result = NULp;
639  freeset(result, GBS_strclose(memfile));
640  return result;
641 }
642 
643 static const char *get_match_hinfo_formatted(PT_probematch *ml, const format_props& format) {
644  if (ml) {
645  GBS_strstruct *memfile = GBS_stropen(500);
646  GBS_strcat(memfile, " "); // one space more than in get_match_info_formatted()
647 
648  cat_dashed_left(memfile, gene_flag ? "organism" : "name", format.name_width);
649  cat_dashed_left(memfile, gene_flag ? "genename" : "fullname", format.gene_or_full_width);
650 
651  if (format.show_mismatches) {
652  cat_dashed_right(memfile, "mis", format.mis_width());
653  cat_dashed_right(memfile, "N_mis", format.N_mis_width());
654  }
655  cat_dashed_right(memfile, "wmis", format.wmis_width());
656  cat_dashed_right(memfile, "pos", format.pos_width);
657  if (format.show_gpos) {
658  cat_dashed_right(memfile, "gpos", format.gpos_width);
659  }
660  if (format.show_ecoli) {
661  cat_dashed_right(memfile, "ecoli", format.ecoli_width);
662  }
663  cat_dashed_left(memfile, "rev", format.rev_width());
664 
665  if (ml->N_mismatches >= 0) { //
666  char *seq = ARB_strdup(ml->sequence);
667  probe_2_readable(seq, strlen(ml->sequence)); // @@@ maybe wrong if match contains PT_QU (see [9070])
668 
669  GBS_strcat(memfile, " '");
670  GBS_strcat(memfile, seq);
671  GBS_strcat(memfile, "'");
672 
673  free(seq);
674  }
675 
676  static char *result = NULp;
677  freeset(result, GBS_strclose(memfile));
678  return result;
679  }
680  // Else set header of result
681  return "There are no targets";
682 }
683 
684 static void gene_rel_2_abs(PT_probematch *ml) {
690 
691  for (; ml; ml = ml->next) {
692  long gene_pos = psg.data[ml->name].get_geneabspos();
693  if (gene_pos >= 0) {
694  ml->g_pos = ml->b_pos;
695  ml->b_pos += gene_pos;
696  }
697  else {
698  fprintf(stderr, "Error in gene-pt-server: gene w/o position info\n");
699  pt_assert(0);
700  }
701  }
702 }
703 
704 bytestring *match_string(const PT_local *locs) {
712  static bytestring bs = { NULp, 0 };
713  GBS_strstruct *memfile;
714  PT_probematch *ml;
715 
716  free(bs.data);
717 
718  char empty[] = "";
719  bs.data = empty;
720  memfile = GBS_stropen(50000);
721 
722  if (locs->pm) {
723  if (gene_flag) gene_rel_2_abs(locs->pm);
724 
726 
727  GBS_strcat(memfile, get_match_hinfo_formatted(locs->pm, format));
728  GBS_chrcat(memfile, (char)1);
729 
730  for (ml = locs->pm; ml; ml = ml->next) {
731  GBS_strcat(memfile, virt_name(ml));
732  GBS_chrcat(memfile, (char)1);
733  GBS_strcat(memfile, get_match_info_formatted(ml, format));
734  GBS_chrcat(memfile, (char)1);
735  }
736  }
737 
738  bs.size = GBS_memoffset(memfile)+1;
739  bs.data = GBS_strclose(memfile);
740  return &bs;
741 }
742 
743 
744 
745 
746 bytestring *MP_match_string(const PT_local *locs) {
754  static bytestring bs = { NULp, 0 };
755 
756  char buffer[50];
757  char buffer1[50];
758  GBS_strstruct *memfile;
759  PT_probematch *ml;
760 
761  delete bs.data;
762  bs.data = NULp;
763  memfile = GBS_stropen(50000);
764 
765  for (ml = locs->pm; ml; ml = ml->next) {
766  GBS_strcat(memfile, virt_name(ml));
767  GBS_chrcat(memfile, (char)1);
768  sprintf(buffer, "%2i", ml->mismatches);
769  GBS_strcat(memfile, buffer);
770  GBS_chrcat(memfile, (char)1);
771  sprintf(buffer1, "%1.1f", ml->wmismatches);
772  GBS_strcat(memfile, buffer1);
773  GBS_chrcat(memfile, (char)1);
774  }
775  bs.size = GBS_memoffset(memfile)+1;
776  bs.data = GBS_strclose(memfile);
777  return &bs;
778 }
779 
780 
781 bytestring *MP_all_species_string(const PT_local *) {
789  static bytestring bs = { NULp, 0 };
790 
791  GBS_strstruct *memfile;
792  int i;
793 
794  delete bs.data;
795  bs.data = NULp;
796  memfile = GBS_stropen(1000);
797 
798  for (i = 0; i < psg.data_count; i++) {
799  GBS_strcat(memfile, psg.data[i].get_shortname());
800  GBS_chrcat(memfile, (char)1);
801  }
802 
803  bs.size = GBS_memoffset(memfile)+1;
804  bs.data = GBS_strclose(memfile);
805  return &bs;
806 }
807 
808 int MP_count_all_species(const PT_local *) {
809  return psg.data_count;
810 }
811 
812 // --------------------------------------------------------------------------------
813 
814 #ifdef UNIT_TESTS
815 #ifndef TEST_UNIT_H
816 #include <test_unit.h>
817 #endif
818 
819 struct EnterStage2 {
820  EnterStage2() {
821  PT_init_psg();
823  }
824  ~EnterStage2() {
825  PT_exit_psg();
826  }
827 };
828 
829 #define TEST_WEIGHTED_MISMATCH(probe,seq,expected) TEST_EXPECT_SIMILAR(weights.get(probe,seq), expected, EPS)
830 
831 void TEST_weighted_mismatches() {
832  EnterStage2 stage2;
833  PT_bond bonds[16] = {
834  { 0.0 }, { 0.0 }, { 0.5 }, { 1.1 },
835  { 0.0 }, { 0.0 }, { 1.5 }, { 0.0 },
836  { 0.5 }, { 1.5 }, { 0.4 }, { 0.9 },
837  { 1.1 }, { 0.0 }, { 0.9 }, { 0.0 },
838  };
839 
840  MismatchWeights weights(bonds);
841 
842  double EPS = 0.0001;
843 
844  TEST_WEIGHTED_MISMATCH(PT_A, PT_A, 0.0);
845  TEST_WEIGHTED_MISMATCH(PT_A, PT_C, 1.1); // (T~A = 1.1) - (T~C = 0)
846  TEST_WEIGHTED_MISMATCH(PT_A, PT_G, 0.2); // (T~A = 1.1) - (T~G = 0.9)
847  TEST_WEIGHTED_MISMATCH(PT_A, PT_T, 1.1);
848 
849  TEST_WEIGHTED_MISMATCH(PT_C, PT_A, 1.0);
850  TEST_WEIGHTED_MISMATCH(PT_C, PT_C, 0.0);
851  TEST_WEIGHTED_MISMATCH(PT_C, PT_G, 1.1);
852  TEST_WEIGHTED_MISMATCH(PT_C, PT_T, 0.6); // (G~C = 1.5) - (G~T = 0.9)
853 
854  TEST_WEIGHTED_MISMATCH(PT_G, PT_A, 1.5);
855  TEST_WEIGHTED_MISMATCH(PT_G, PT_C, 1.5);
856  TEST_WEIGHTED_MISMATCH(PT_G, PT_G, 0.0);
857  TEST_WEIGHTED_MISMATCH(PT_G, PT_T, 1.5);
858 
859  TEST_WEIGHTED_MISMATCH(PT_T, PT_A, 1.1);
860  TEST_WEIGHTED_MISMATCH(PT_T, PT_C, 1.1);
861  TEST_WEIGHTED_MISMATCH(PT_T, PT_G, 0.6);
862  TEST_WEIGHTED_MISMATCH(PT_T, PT_T, 0.0);
863 
864 
865  TEST_WEIGHTED_MISMATCH(PT_N, PT_A, 0.9);
866  TEST_WEIGHTED_MISMATCH(PT_N, PT_C, 0.925);
867  TEST_WEIGHTED_MISMATCH(PT_N, PT_G, 0.475);
868  TEST_WEIGHTED_MISMATCH(PT_N, PT_T, 0.8);
869 
870  TEST_WEIGHTED_MISMATCH(PT_A, PT_N, 0.6);
871  TEST_WEIGHTED_MISMATCH(PT_C, PT_N, 0.675);
872  TEST_WEIGHTED_MISMATCH(PT_G, PT_N, 1.125);
873  TEST_WEIGHTED_MISMATCH(PT_T, PT_N, 0.7);
874 
875  TEST_WEIGHTED_MISMATCH(PT_N, PT_N, 0.775);
876  TEST_WEIGHTED_MISMATCH(PT_QU, PT_QU, 0.775);
877  TEST_WEIGHTED_MISMATCH(PT_QU, PT_N, 0.775);
878 }
879 
880 #endif // UNIT_TESTS
881 
882 // --------------------------------------------------------------------------------
883 
884 
885 
struct probe_input_data * data
Definition: probe.h:356
const char * aisc_unlink(dllheader_ext *object)
Definition: struct_man.c:185
bool show_mismatches
Definition: PT_match.cxx:438
int get_stop() const
Definition: probe.h:197
string result
GB_TYPES type
void PT_init_psg()
Definition: PT_main.cxx:131
MatchRequest(PT_local &locs, int probe_length)
Definition: PT_match.cxx:64
void cat_spaced_right(GBS_strstruct *memfile, const char *text, int width)
Definition: PT_match.cxx:516
Definition: probe.h:83
SmartCharPtr get_dataPtr() const
Definition: probe.h:165
#define MIN_PROBE_LENGTH
Definition: probe.h:59
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
void complement_probe(char *Probe, int len)
Definition: PT_complement.h:35
static char * y[maxsp+1]
bytestring * match_string(const PT_local *locs)
Definition: PT_match.cxx:704
bool add_hits_for_children(POS_TREE2 *pt, const Mismatches &mismatch)
Definition: PT_match.cxx:160
Definition: probe.h:84
void GB_sort(void **array, size_t first, size_t behind_last, gb_compare_function compare, void *client_data)
Definition: arb_sort.cxx:27
bool show_gpos
Definition: PT_match.cxx:440
bool show_ecoli
Definition: PT_match.cxx:439
int get_name() const
Definition: probe_tree.h:434
probe_struct_global psg
Definition: PT_main.cxx:36
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
Definition: probe.h:85
GBDATA * gb_main
Definition: probe.h:351
static const char * get_match_info_formatted(PT_probematch *ml, const format_props &format)
Definition: PT_match.cxx:615
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
int get_size() const
Definition: probe.h:211
void cat_dashed_right(GBS_strstruct *memfile, const char *text, int width)
Definition: PT_match.cxx:518
PT_MATCH_TYPE
Definition: probe.h:61
void cat_dashed_left(GBS_strstruct *memfile, const char *text, int width)
Definition: PT_match.cxx:517
static void pt_sort_match_list(PT_local *locs)
Definition: PT_match.cxx:312
int MP_count_all_species(const PT_local *)
Definition: PT_match.cxx:808
bool collect_hits_for(const char *probe, POS_TREE2 *pt, Mismatches &mismatch, int height)
Definition: PT_match.cxx:210
int get_match_start(const PT_probematch *ml)
Definition: PT_match.cxx:608
const char * get_match_acc(const PT_probematch *ml)
Definition: PT_match.cxx:605
static format_props detect_format_props(const PT_local *locs, bool show_gpos)
Definition: PT_match.cxx:461
const char * virt_name(const PT_probematch *ml)
Definition: PT_etc.cxx:57
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
FILE * seq
Definition: rns.c:46
void reverse_probe(char *seq, int len)
Definition: probe.h:112
void enter_stage(Stage stage_)
int get_match_stop(const PT_probematch *ml)
Definition: PT_match.cxx:611
bool accepted() const
Definition: PT_match.cxx:129
int wmis_width() const
Definition: PT_match.cxx:451
static const char * get_match_hinfo_formatted(PT_probematch *ml, const format_props &format)
Definition: PT_match.cxx:643
GBS_strstruct * GBS_stropen(long init_size)
Definition: arb_strbuf.cxx:39
void PT_exit_psg()
Definition: PT_main.cxx:137
char * ARB_strduplen(const char *p, unsigned len)
Definition: arb_string.h:33
int gene_or_full_width
Definition: PT_match.cxx:443
int get_abs_pos() const
Definition: probe_tree.h:435
static int pt_sort_compare_match(const void *PT_probematch_ptr1, const void *PT_probematch_ptr2, void *)
Definition: PT_match.cxx:285
Definition: probe.h:122
double * pos_to_weight
Definition: probe.h:366
char * data
Definition: bytestring.h:16
int get_start() const
Definition: probe.h:191
void cat_spaced_left(GBS_strstruct *memfile, const char *text, int width)
Definition: PT_match.cxx:515
int probe_match(PT_local *locs, aisc_string probestring)
Definition: PT_match.cxx:364
static int weights[MAX_BASETYPES][MAX_BASETYPES]
Definition: ClustalV.cxx:71
char * main_probe
Definition: probe.h:372
long GBS_memoffset(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:91
int compress_data(char *probestring)
Definition: PT_io.cxx:21
PT_local & get_PT_local() const
Definition: PT_match.cxx:136
void GBS_strcat(GBS_strstruct *strstr, const char *ptr)
Definition: arb_strbuf.cxx:108
const char * get_shortname() const
Definition: probe.h:170
long PT_abs_2_ecoli_rel(long pos)
Definition: PT_io.cxx:341
bool add_hit(const DataLoc &at, const Mismatches &mismatch)
Definition: PT_match.cxx:140
CONSTEXPR_INLINE bool is_std_base(char b)
Definition: probe.h:93
#define CONSTEXPR_INLINE
Definition: cxxforward.h:92
int N_mis_width() const
Definition: PT_match.cxx:450
ASSERTING_CONSTEXPR_INLINE int info2bio(int infopos)
Definition: arb_defs.h:27
static std::map< PT_local *, Splits > splits_for_match_overlay
Definition: PT_match.cxx:362
const char * virt_fullname(const PT_probematch *ml)
Definition: PT_etc.cxx:69
int rev_width() const
Definition: PT_match.cxx:448
int mis_width() const
Definition: PT_match.cxx:449
const AbsLoc & at() const
Definition: probe_tree.h:685
#define cmp(h1, h2)
Definition: admap.cxx:50
bytestring * MP_all_species_string(const PT_local *)
Definition: PT_match.cxx:781
TYPE get_type() const
Definition: probe_tree.h:246
Definition: probe.h:86
Definition: probe.h:88
#define pt_assert(bed)
Definition: PT_tools.h:22
const char * get_match_overlay(const PT_probematch *ml)
Definition: PT_match.cxx:520
double get_weighted() const
Definition: PT_match.cxx:48
void count_weighted(char probe, char seq, int height)
Definition: PT_match.cxx:121
PT * PT_read_son(PT *node, PT_base base)
void count_versus(const ReadableDataLoc &loc, const char *probe, int height)
Definition: PT_match.cxx:190
#define EPS
Definition: awt_canvas.hxx:291
void GBS_chrcat(GBS_strstruct *strstr, char ch)
Definition: arb_strbuf.cxx:119
CONSTEXPR_INLINE char base_2_readable(char base)
Definition: probe.h:98
int accept_N_mismatches(int ambig) const
Definition: PT_match.cxx:84
Mismatches(MatchRequest &req_)
Definition: PT_match.cxx:38
PT_base
Definition: probe.h:82
PT_local & get_PT_local() const
Definition: PT_match.cxx:76
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
static void pt_build_pos_to_weight(PT_MATCH_TYPE type, const char *sequence)
Definition: PT_match.cxx:346
void pt_export_error(PT_local *locs, const char *error)
Definition: PT_etc.cxx:19
T_PT_LOCS locs
int gene_flag
Definition: PT_main.cxx:39
int get_plain() const
Definition: PT_match.cxx:45
char * GBS_strclose(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:69
POS_TREE2 *& TREE_ROOT2()
Definition: probe.h:390
char * probe_2_readable(char *id_string, int len)
Definition: probe.h:102
double get(int probe, int seq) const
Definition: pt_split.h:82
Definition: probe.h:89
long get_geneabspos() const
Definition: probe.h:222
static void gene_rel_2_abs(PT_probematch *ml)
Definition: PT_match.cxx:684
void aisc_link(dll_public *dll, PT_probematch *match)
Definition: PT_match.cxx:25
int get_rel_pos() const
Definition: probe_tree.h:471
double check(char base, char ref) const
Definition: pt_split.h:114
#define NULp
Definition: cxxforward.h:97
const char * get_acc() const
Definition: probe.h:184
double get_mismatch_weight(char probe, char seq) const
Definition: PT_match.cxx:94
bool hit_limit_reached() const
Definition: PT_match.cxx:78
GB_transaction ta(gb_var)
int get_ambig() const
Definition: PT_match.cxx:46
int allowed_mismatches() const
Definition: PT_match.cxx:93
char * create_reversed_probe(char *probe, int len)
Definition: PT_match.cxx:335
#define min(a, b)
Definition: f2c.h:153
CONSTEXPR_INLINE bool is_ambig_base(char b)
Definition: probe.h:95
bytestring * MP_match_string(const PT_local *locs)
Definition: PT_match.cxx:746
void set_max(const char *str, int &curr_max)
Definition: PT_match.cxx:454
Definition: probe.h:87
CONSTEXPR_INLINE double calc_position_wmis(int pos, int seq_len, double y1, double y2)
Definition: PT_match.cxx:342
void cat_internal(GBS_strstruct *memfile, int len, const char *text, int width, char spacer, bool align_left)
Definition: PT_match.cxx:487