ARB
MG_adapt_ali.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : MG_adapt_ali.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "merge.hxx"
12 #include "MG_adapt_ali.hxx"
13 
14 #include <aw_msg.hxx>
15 #include <arbdbt.h>
16 #include <arb_strbuf.h>
17 #include <arb_global_defs.h>
18 
19 #include <cmath>
20 #include <list>
21 
22 #if defined(DEBUG)
23 // #define DUMP_MAPPING
24 // #define DUMP_SOFTMAPPING
25 #endif // DEBUG
26 
27 // -----------------
28 // MG_remap
29 
30 const int NO_POSITION = -1;
31 const int LEFT_BORDER = -2;
32 const int RIGHT_BORDER = -3;
33 
34 struct softbase {
35  char base;
36  int origin; // position in source alignment
37  char last_gapchar; // last gap seen before base
38  int targetpos; // target position
39 
40  softbase(char base_, int origin_, char last_gapchar_) :
41  base(base_),
42  origin(origin_),
43  last_gapchar(last_gapchar_),
44  targetpos(NO_POSITION)
45  {
46  mg_assert(last_gapchar);
47  }
48 
49  operator char () const { return base; }
50 };
51 
52 typedef std::list<softbase> softbaselist;
53 typedef softbaselist::iterator softbaseiter;
54 typedef softbaselist::const_iterator const_softbaseiter;
55 
56 class MG_remap : virtual Noncopyable { // @@@ rename MG_remap -> AliRemapper
57  int in_length;
58  int out_length;
59  int *remap_tab; // fixed mapping (targetPosition or NO_POSITION)
60 
61  // soft-mapping:
62  mutable int *soft_remap_tab; // soft-mapping (NO_POSITION, targetPosition, LEFT_BORDER or RIGHT_BORDER)
63 
64  char *calc_softmapping(softbaselist& softbases, int start, int end, int &outlen) const;
65  int softmap_to(softbaselist& softbases, int start, int end, GBS_strstruct& outs) const;
66 
67  bool have_softmapping() const { return soft_remap_tab; }
68  void create_softmapping() const;
69  void forget_softmapping() const {
70  delete [] soft_remap_tab;
71  soft_remap_tab = NULp;
72  }
73 
74  static int *build_initial_mapping(const char *iref, int ilen, const char *oref, int olen);
75  void merge_mapping(MG_remap &other, int& inconsistent, int& added);
76 
77 public:
78 
80  in_length(0),
81  out_length(0),
82  remap_tab(NULp),
83  soft_remap_tab(NULp)
84  {}
86  forget_softmapping();
87  delete [] remap_tab;
88  }
89 
90  const char *add_reference(const char *in_reference, const char *out_reference); // returns only warnings
91  char *remap(const char *sequence) const; // returns 0 on error, else copy of sequence
92 
93 #if defined(DUMP_MAPPING)
94  static void dump(const int *data, int len, const char *comment, int dontShow) {
95  fflush(stdout);
96  fflush(stderr);
97  fputc('>', stdout);
98  int digits = calc_signed_digits(len);
99  for (int pos = 0; pos<len; ++pos) {
100  if (data[pos] == dontShow) {
101  fprintf(stdout, "%*s", digits, "_");
102  }
103  else {
104  fprintf(stdout, "%*i", digits, data[pos]);
105  }
106  }
107  fprintf(stdout, " (%s)\n", comment);
108  fflush(stdout);
109  }
110  void dump_remap(const char *comment) { dump(remap_tab, in_length, comment, NO_POSITION); }
111 #endif // DUMP_MAPPING
112 };
113 
114 int *MG_remap::build_initial_mapping(const char *iref, int ilen, const char *oref, int olen) {
115  int *remap = new int[ilen];
116 
117  const char *spacers = "-. n";
118 
119  int ipos = 0;
120  int opos = 0;
121 
122  while (ipos<ilen && opos<olen) {
123  size_t ispaces = strspn(iref+ipos, spacers);
124  size_t ospaces = strspn(oref+opos, spacers);
125 
126  while (ispaces && ipos<ilen) {
127  remap[ipos++] = NO_POSITION;
128  ispaces--;
129  }
130  opos += ospaces;
131  if (ipos<ilen && opos<olen) remap[ipos++] = opos++;
132  }
133  while (ipos<ilen) remap[ipos++] = NO_POSITION;
134 
135  return remap;
136 }
137 
138 #if defined(DEBUG)
139 // #define DUMP_INCONSISTENCIES
140 #endif // DEBUG
141 
142 
143 void MG_remap::merge_mapping(MG_remap &other, int& inconsistent, int& added) {
144  const int *primary = remap_tab;
145  int *secondary = other.remap_tab;
146  bool increased_len = false;
147 
148  if (other.in_length>in_length) {
149  // re-use memory of bigger map
150  std::swap(other.in_length, in_length);
151  std::swap(other.remap_tab, remap_tab);
152 
153  increased_len = true;
154  }
155  out_length = std::max(out_length, other.out_length); // remember biggest output length
156 
157  int mixlen = std::min(in_length, other.in_length);
158 
159  // eliminate inconsistant positions from secondary mapping (forward)
160  inconsistent = 0;
161  {
162  int max_target_pos = 0;
163  for (int pos = 0; pos<mixlen; ++pos) {
164  max_target_pos = std::max(max_target_pos, primary[pos]);
165  if (secondary[pos]<max_target_pos) {
166  if (secondary[pos] != NO_POSITION) {
167 #if defined(DUMP_INCONSISTENCIES)
168  fprintf(stderr, "merge-inconsistency: pos=%i primary[]=%i secondary[]=%i max_target_pos=%i\n",
169  pos, primary[pos], secondary[pos], max_target_pos);
170 #endif // DUMP_INCONSISTENCIES
171  inconsistent++;
172  }
173  secondary[pos] = NO_POSITION; // consistency error -> ignore position
174  }
175  }
176  }
177  // (backward)
178  {
179  int min_target_pos = out_length-1;
180  for (int pos = mixlen-1; pos >= 0; --pos) {
181  if (primary[pos] >= 0 && primary[pos]<min_target_pos) min_target_pos = primary[pos];
182  if (secondary[pos] > min_target_pos) {
183 #if defined(DUMP_INCONSISTENCIES)
184  fprintf(stderr, "merge-inconsistency: pos=%i primary[]=%i secondary[]=%i min_target_pos=%i\n",
185  pos, primary[pos], secondary[pos], min_target_pos);
186 #endif // DUMP_INCONSISTENCIES
187  inconsistent++;
188  secondary[pos] = NO_POSITION; // consistency error -> ignore position
189  }
190  }
191  }
192 
193  // merge mappings
194  added = 0;
195  for (int pos = 0; pos < mixlen; ++pos) {
196  if (primary[pos] == NO_POSITION) {
197  remap_tab[pos] = secondary[pos];
198  added += (remap_tab[pos] != NO_POSITION);
199  }
200  else {
201  remap_tab[pos] = primary[pos];
202  }
203  // remap_tab[pos] = primary[pos] == NO_POSITION ? secondary[pos] : primary[pos];
204  mg_assert(remap_tab[pos]<out_length);
205  }
206 
207  if (increased_len) { // count positions appended at end of sequence
208  for (int pos = other.in_length; pos<in_length; ++pos) {
209  added += (remap_tab[pos] != NO_POSITION);
210  }
211  }
212 
213  // Note: copying the rest from larger mapping is not necessary
214  // (it's already there, because of memory-reuse)
215 }
216 
217 const char *MG_remap::add_reference(const char *in_reference, const char *out_reference) {
218  // returns warning about inconsistencies/useless reference
219  const char *warning = NULp;
220 
221  if (have_softmapping()) forget_softmapping();
222 
223  if (!remap_tab) {
224  in_length = strlen(in_reference);
225  out_length = strlen(out_reference);
226  remap_tab = build_initial_mapping(in_reference, in_length, out_reference, out_length);
227 #if defined(DUMP_MAPPING)
228  dump_remap("initial");
229 #endif // DUMP_MAPPING
230  }
231  else {
232  MG_remap tmp;
233  tmp.add_reference(in_reference, out_reference);
234 
235  int inconsistent, added;
236  merge_mapping(tmp, inconsistent, added);
237 
238  if (inconsistent>0) warning = GBS_global_string("contains %i inconsistent positions", inconsistent);
239  if (added<1) {
240  const char *useless_warning = "doesn't add useful information";
241  if (warning) warning = GBS_global_string("%s and %s", warning, useless_warning);
242  else warning = useless_warning;
243  }
244 
245 #if defined(DUMP_MAPPING)
246  dump_remap("merged");
247 #endif // DUMP_MAPPING
248  }
249 
250  return warning;
251 }
252 
253 void MG_remap::create_softmapping() const {
254  soft_remap_tab = new int[in_length];
255 
256  int last_fixed_position = NO_POSITION;
257  int pos;
258  for (pos = 0; pos<in_length && last_fixed_position == NO_POSITION; ++pos) {
259  if (remap_tab[pos] == NO_POSITION) {
260  soft_remap_tab[pos] = LEFT_BORDER;
261  }
262  else {
263  soft_remap_tab[pos] = NO_POSITION;
264  last_fixed_position = pos;
265  }
266  }
267  if (last_fixed_position != NO_POSITION) {
268  for ( ; pos<in_length; ++pos) {
269  if (remap_tab[pos] != NO_POSITION) {
270  int softstart = last_fixed_position+1;
271  int softsize = pos-softstart;
272 
273  if (softsize>0) {
274  int target_softstart = remap_tab[last_fixed_position]+1;
275  int target_softsize = remap_tab[pos]-target_softstart;
276 
277  double target_step;
278  if (softsize>1 && target_softsize>1) {
279  target_step = double(target_softsize-1)/(softsize-1);
280  }
281  else {
282  target_step = 0.0;
283  }
284 
285  if (target_step >= 1.0 && target_softsize>softsize) {
286  // target range > source range -> split softmapping in the middle
287  int halfsoftsize = softsize/2;
288  int target_softpos = softstart;
289  int off;
290  for (off = 0; off<halfsoftsize; ++off) { // LOOP_VECTORIZED
291  soft_remap_tab[softstart+off] = target_softpos++;
292  }
293  target_softpos += target_softsize-softsize;
294  for (; off<softsize; ++off) { // LOOP_VECTORIZED
295  soft_remap_tab[softstart+off] = target_softpos++;
296  }
297  }
298  else {
299  double target_softpos = target_softstart;
300  for (int off = 0; off<softsize; ++off) {
301  soft_remap_tab[softstart+off] = int(target_softpos+0.5);
302  target_softpos += target_step;
303  }
304  }
305  }
306  last_fixed_position = pos;
307  soft_remap_tab[pos] = NO_POSITION;
308  }
309  }
310 
311  for (--pos; pos>last_fixed_position; --pos) { // LOOP_VECTORIZED
312  soft_remap_tab[pos] = RIGHT_BORDER;
313  }
314  }
315 
316 #if defined(DUMP_MAPPING)
317  dump(soft_remap_tab, in_length, "softmap", -1);
318 #endif // DUMP_MAPPING
319 }
320 
321 static void drop_dots(softbaselist& softbases, int excessive_positions) {
322  // drop consecutive dots
323  bool justseendot = false;
324 
325  softbaseiter next = softbases.begin();
326  while (excessive_positions && next != softbases.end()) {
327  bool isdot = (next->base == '.');
328  if (isdot && justseendot) {
329  excessive_positions--;
330  next = softbases.erase(next);
331  }
332  else {
333  justseendot = isdot;
334  ++next;
335  }
336  }
337 
338  if (excessive_positions) {
339  // drop single dots
340  next = softbases.begin();
341  while (excessive_positions && next != softbases.end()) {
342  if (next->base == '.') {
343  next = softbases.erase(next);
344  }
345  else {
346  ++next;
347  }
348  }
349  }
350 }
351 
352 #if defined(DUMP_SOFTMAPPING)
353 static char *softbaselist_2_string(const softbaselist& softbases) {
354  GBS_strstruct out(softbases.size()*100);
355 
356  for (const_softbaseiter base = softbases.begin(); base != softbases.end(); ++base) {
357  const char *info = GBS_global_string(" %c'%c' %i->",
358  base->last_gapchar ? base->last_gapchar : ' ',
359  base->base,
360  base->origin);
361  out.cat(info);
362  if (base->targetpos == NO_POSITION) {
363  out.cat("NP");
364  }
365  else {
366  out.putlong(base->targetpos);
367  }
368  }
369 
370  return out.release();
371 }
372 #endif // DUMP_SOFTMAPPING
373 
374 char *MG_remap::calc_softmapping(softbaselist& softbases, int start, int end, int& outlen) const {
377 
378  int wanted_size = end-start+1;
379  int listsize = softbases.size();
380 
381 #if defined(DUMP_SOFTMAPPING)
382  char *sbl_initial = softbaselist_2_string(softbases);
383  char *sbl_exdots = NULp;
384  char *sbl_target = NULp;
385  char *sbl_exclash = NULp;
386 #endif // DUMP_SOFTMAPPING
387 
388  if (listsize>wanted_size) {
389  int excessive_positions = listsize-wanted_size;
390  drop_dots(softbases, excessive_positions);
391  listsize = softbases.size();
392 
393 #if defined(DUMP_SOFTMAPPING)
394  sbl_exdots = softbaselist_2_string(softbases);
395 #endif // DUMP_SOFTMAPPING
396  }
397 
398  char *result = NULp;
399  if (listsize >= wanted_size) { // not or just enough space -> plain copy
400  ARB_alloc(result, listsize+1);
401  *std::copy(softbases.begin(), softbases.end(), result) = 0;
402  outlen = listsize;
403  }
404  else { // otherwise do soft-mapping
405  ARB_alloc(result, wanted_size+1);
406  mg_assert(listsize < wanted_size);
407 
408  // calculate target positions and detect mapping conflicts
409  bool conflicts = false;
410  {
411  int lasttargetpos = NO_POSITION;
412  for (softbaseiter base = softbases.begin(); base != softbases.end(); ++base) {
413  // // int targetpos = calc_softpos(base->origin);
414  int targetpos = soft_remap_tab[base->origin];
415  if (targetpos == lasttargetpos) {
416  mg_assert(targetpos != NO_POSITION);
417  conflicts = true;
418  }
419  base->targetpos = lasttargetpos = targetpos;
420  }
421  }
422 
423 #if defined(DUMP_SOFTMAPPING)
424  sbl_target = softbaselist_2_string(softbases);
425 #endif // // DUMP_SOFTMAPPING
426 
427  if (conflicts) {
428  int nextpos = end+1;
429  for (softbaselist::reverse_iterator base = softbases.rbegin(); base != softbases.rend(); ++base) {
430  if (base->targetpos >= nextpos) {
431  base->targetpos = nextpos-1;
432  }
433  nextpos = base->targetpos;
434  mg_assert(base->targetpos >= start);
435  mg_assert(base->targetpos <= end);
436  }
437  mg_assert(nextpos >= start);
438 
439 #if defined(DUMP_SOFTMAPPING)
440  sbl_exclash = softbaselist_2_string(softbases);
441 #endif // // DUMP_SOFTMAPPING
442  }
443 
444  int idx = 0;
445  for (softbaseiter base = softbases.begin(); base != softbases.end(); ++base) {
446  int pos = base->targetpos - start;
447 
448  if (idx<pos) {
449  char gapchar = base->last_gapchar;
450  while (idx<pos) result[idx++] = gapchar;
451  }
452  result[idx++] = base->base;
453  }
454  result[idx] = 0;
455  outlen = idx;
456  mg_assert(idx <= wanted_size);
457  }
458 
459 #if defined(DUMP_SOFTMAPPING)
460  fflush(stdout);
461  fflush(stderr);
462  printf("initial:%s\n", sbl_initial);
463  if (sbl_exdots) printf("exdots :%s\n", sbl_exdots);
464  if (sbl_target) printf("target :%s\n", sbl_target);
465  if (sbl_exclash) printf("exclash:%s\n", sbl_exclash);
466  printf("calc_softmapping(%i, %i) -> \"%s\"\n", start, end, result);
467  fflush(stdout);
468  free(sbl_exclash);
469  free(sbl_target);
470  free(sbl_exdots);
471  free(sbl_initial);
472 #endif // DUMP_SOFTMAPPING
473 
474  return result;
475 }
476 
477 int MG_remap::softmap_to(softbaselist& softbases, int start, int end, GBS_strstruct& outs) const {
478  int mappedlen;
479  char *softmapped = calc_softmapping(softbases, start, end, mappedlen);
480 
481  outs.cat(softmapped);
482  free(softmapped);
483  softbases.clear();
484 
485  return mappedlen;
486 }
487 
488 char *MG_remap::remap(const char *sequence) const {
489  int slen = strlen(sequence);
490  int minlen = std::min(slen, in_length);
491  GBS_strstruct outs(out_length+1);
492  int written = 0;
493  softbaselist softbases;
494  int pos;
495 
496  if (!have_softmapping()) create_softmapping();
497 
498  // remap left border
499  for (pos = 0; pos<in_length && soft_remap_tab[pos] == LEFT_BORDER; ++pos) {
500  char c = pos<slen ? sequence[pos] : '-';
501  if (!GAP::is_std_gap(c)) {
502  mg_assert(pos<slen); // @@@ simplify code above
503  softbases.push_back(softbase(c, pos, '.'));
504  }
505  }
506  char last_gapchar = 0;
507  {
508  int bases = softbases.size();
509  if (bases) {
510  int fixed = remap_tab[pos];
511  mg_assert(fixed != NO_POSITION);
512 
513  int bases_start = fixed-bases;
514  if (written<bases_start) {
515  outs.nput('.', bases_start-written);
516  written = bases_start;
517  }
518 
519  written += softmap_to(softbases, written, fixed-1, outs);
520  }
521  else {
522  last_gapchar = '.';
523  }
524  }
525 
526 
527  // remap center (fixed mapping and softmapping inbetween)
528  for (; pos<in_length; ++pos) {
529  char c = pos<slen ? sequence[pos] : '-';
530  int target_pos = remap_tab[pos];
531 
532  if (target_pos == NO_POSITION) { // softmap
533  if (c == '-') {
534  if (!last_gapchar) last_gapchar = '-';
535  }
536  else {
537  if (!last_gapchar) last_gapchar = c == '.' ? '.' : '-';
538 
539  softbases.push_back(softbase(c, pos, last_gapchar)); // remember bases for softmapping
540  if (c != '.') last_gapchar = 0;
541  }
542  }
543  else { // fixed mapping
544  if (!softbases.empty()) {
545  written += softmap_to(softbases, written, target_pos-1, outs);
546  }
547  if (written<target_pos) {
548  if (!last_gapchar) last_gapchar = c == '.' ? '.' : '-';
549 
550  outs.nput(last_gapchar, target_pos-written);
551  written = target_pos;
552  }
553 
554  if (c == '-') {
555  if (!last_gapchar) last_gapchar = '-';
556  outs.put(last_gapchar); written++;
557  }
558  else {
559  outs.put(c); written++;
560  if (c != '.') last_gapchar = 0;
561  }
562  }
563  }
564 
565  // Spool leftover softbases.
566  // Happens when
567  // - sequence ends before last fixed position
568  // - sequence starts after last fixed position
569 
570  if (!softbases.empty()) {
571  // softmap_to(softbases, written, written+softbases.size()-1, outs);
572  softmap_to(softbases, written, written, outs);
573  }
574  mg_assert(softbases.empty());
575 
576  // copy overlength rest of sequence:
577  const char *gap_chars = "- .";
578  for (int i = minlen; i < slen; i++) {
579  int c = sequence[i];
580  if (!strchr(gap_chars, c)) outs.put(c); // don't fill with gaps
581  }
582 
583  // convert end of seq ('-' -> '.')
584  {
585  const char *out = outs.get_data();
586  int end = outs.get_position();
587 
588  int terminal_gaps; // count terminal gaps...
589  for (terminal_gaps = 0; terminal_gaps<end; ++terminal_gaps) {
590  char c = out[end-1-terminal_gaps];
591  if (c != '-' and c != '.') break;
592  }
593 
594  if (terminal_gaps) {
595  // ... and replace them by dots:
596  outs.cut_tail(terminal_gaps);
597  outs.nput('.', terminal_gaps);
598  }
599  }
600 
601  return outs.release();
602 }
603 
604 // --------------------------------------------------------------------------------
605 
606 static MG_remap *MG_create_remap(GBDATA *gb_left, GBDATA *gb_right, const char *reference_species_names, const char *alignment_name) {
607  MG_remap *rem = new MG_remap;
608  char *ref_list = ARB_strdup(reference_species_names);
609 
610  for (char *tok = strtok(ref_list, " \n,;"); tok; tok = strtok(NULp, " \n,;")) {
611  bool is_SAI = strncmp(tok, "SAI:", 4) == 0;
612  GBDATA *gb_species_left = NULp;
613  GBDATA *gb_species_right = NULp;
614 
615  if (is_SAI) {
616  gb_species_left = GBT_find_SAI(gb_left, tok+4);
617  gb_species_right = GBT_find_SAI(gb_right, tok+4);
618  }
619  else {
620  gb_species_left = GBT_find_species(gb_left, tok);
621  gb_species_right = GBT_find_species(gb_right, tok);
622  }
623 
624  if (!gb_species_left || !gb_species_right) {
625  aw_message(GBS_global_string("Warning: Couldn't find %s'%s' in %s DB.\nPlease read ADAPT ALIGNMENT section in help file!",
626  is_SAI ? "" : "species ",
627  tok,
628  gb_species_left ? "right" : "left"));
629  }
630  else {
631  // look for sequence/SAI "data"
632  GBDATA *gb_seq_left = GBT_find_sequence(gb_species_left, alignment_name);
633  GBDATA *gb_seq_right = GBT_find_sequence(gb_species_right, alignment_name);
634 
635  if (gb_seq_left && gb_seq_right) {
636  GB_TYPES type_left = GB_read_type(gb_seq_left);
637  GB_TYPES type_right = GB_read_type(gb_seq_right);
638 
639  if (type_left != type_right) {
640  aw_message(GBS_global_string("Warning: data type of '%s' differs in both databases", tok));
641  }
642  else {
643  const char *warning;
644  if (type_right == GB_STRING) {
645  warning = rem->add_reference(GB_read_char_pntr(gb_seq_left), GB_read_char_pntr(gb_seq_right));
646  }
647  else {
648  char *sleft = GB_read_as_string(gb_seq_left);
649  char *sright = GB_read_as_string(gb_seq_right);
650 
651  warning = rem->add_reference(sleft, sright);
652 
653  free(sleft);
654  free(sright);
655  }
656 
657  if (warning) aw_message(GBS_global_string("Warning: '%s' %s", tok, warning));
658  }
659  }
660  }
661  }
662  free(ref_list);
663 
664  return rem;
665 }
666 
667 // --------------------------------------------------------------------------------
668 
669 MG_remaps::MG_remaps(GBDATA *gb_left, GBDATA *gb_right, bool enable, const char *reference_species_names)
670  : n_remaps(0),
671  remaps(NULp)
672 {
673  if (enable) { // otherwise nothing will be remapped!
674  GBT_get_alignment_names(alignment_names, gb_left);
675  for (n_remaps = 0; alignment_names[n_remaps]; n_remaps++) {} // count alignments
676 
677  ARB_calloc(remaps, n_remaps);
678  for (int i = 0; i<n_remaps; ++i) {
679  remaps[i] = MG_create_remap(gb_left, gb_right, reference_species_names, alignment_names[i]);
680  }
681  }
682 }
683 
685  mg_assert(correlated(n_remaps, remaps));
686  for (int i=0; i<n_remaps; i++) delete remaps[i];
687  free(remaps);
688 }
689 
690 static GB_ERROR adaptCopiedAlignment(const MG_remap& remap, GBDATA *source_species, GBDATA *destination_species, const char *alignment_name) {
691  // align sequence after copy
692  GB_ERROR error = NULp;
693 
694  GBDATA *gb_seq_left = GBT_find_sequence(source_species, alignment_name);
695  GBDATA *gb_seq_right = GBT_find_sequence(destination_species, alignment_name);
696 
697  if (gb_seq_left && gb_seq_right) { // if one DB hasn't sequence -> sequence was not copied
698  char *ls = GB_read_string(gb_seq_left);
699  char *rs = GB_read_string(gb_seq_right);
700 
701  if (strcmp(ls, rs) == 0) { // if sequences are not identical -> sequence was not copied
702  free(rs);
703  rs = remap.remap(ls);
704 
705  if (!rs) error = GB_await_error();
706  else {
707  long old_check = GBS_checksum(ls, 0, ".- ");
708  long new_check = GBS_checksum(rs, 0, ".- ");
709 
710  if (old_check == new_check) {
711  error = GB_write_string(gb_seq_right, rs);
712  }
713  else {
714  error = GBS_global_string("Failed to adapt alignment of '%s' (checksum changed)", GBT_get_name_or_description(source_species));
715  }
716  }
717  }
718  free(rs);
719  free(ls);
720  }
721 
722  return error;
723 }
724 
725 GB_ERROR MG_adaptAllCopiedAlignments(const MG_remaps& remaps, GBDATA *source_species, GBDATA *destination_species) {
730  GB_ERROR error = NULp;
731 
732  if (remaps.doesRemap()) {
733  for (int i=0; i<remaps.size() && !error; i++) {
734  error = adaptCopiedAlignment(remaps.remap(i), source_species, destination_species, remaps.alignment_name(i));
735  }
736  }
737 
738  return error;
739 }
740 
741 // --------------------------------------------------------------------------------
742 
743 #ifdef UNIT_TESTS
744 
745 #include <test_unit.h>
746 
747 // #define VERBOOSE_REMAP_TESTS
748 
749 #if defined(VERBOOSE_REMAP_TESTS)
750 #define DUMP_REMAP(id, comment, from, to) fprintf(stderr, "%s %s '%s' -> '%s'\n", id, comment, from, to)
751 #define DUMP_REMAP_STR(str) fputs(str, stderr)
752 #else
753 #define DUMP_REMAP(id, comment, from, to)
754 #define DUMP_REMAP_STR(str)
755 #endif
756 
757 #define TEST_REMAP(id, remapper, ASS_EQ, seqold, expected, got) \
758  char *calculated = remapper.remap(seqold); \
759  DUMP_REMAP(id, "want", seqold, expected); \
760  DUMP_REMAP(id, "calc", seqold, calculated); \
761  ASS_EQ(calculated, expected, got); \
762  DUMP_REMAP_STR("\n"); \
763  free(calculated);
764 
765 #define TEST_REMAP1REF_INT(id, ASS_EQ, refold, refnew, seqold, expected, got) \
766  do { \
767  MG_remap remapper; \
768  DUMP_REMAP(id, "ref ", refold, refnew); \
769  remapper.add_reference(refold, refnew); \
770  TEST_REMAP(id, remapper, ASS_EQ, seqold, expected, got); \
771  } while(0)
772 
773 #define TEST_REMAP2REFS_INT(id, ASS_EQ, refold1, refnew1, refold2, refnew2, seqold, expected, got) \
774  do { \
775  MG_remap remapper; \
776  DUMP_REMAP(id, "ref1", refold1, refnew1); \
777  DUMP_REMAP(id, "ref2", refold2, refnew2); \
778  remapper.add_reference(refold1, refnew1); \
779  remapper.add_reference(refold2, refnew2); \
780  TEST_REMAP(id, remapper, ASS_EQ, seqold, expected, got); \
781  } while(0)
782 
783 #define TEST_REMAP1REF(id,ro,rn,seqold,expected) TEST_REMAP1REF_INT(id, TEST_EXPECT_EQUAL__IGNARG, ro, rn, seqold, expected, NULp)
784 #define TEST_REMAP1REF__BROKEN(id,ro,rn,seqold,expected,got) TEST_REMAP1REF_INT(id, TEST_EXPECT_EQUAL__BROKEN, ro, rn, seqold, expected, got)
785 
786 #define TEST_REMAP2REFS(id,ro1,rn1,ro2,rn2,seqold,expected) TEST_REMAP2REFS_INT(id, TEST_EXPECT_EQUAL__IGNARG, ro1, rn1, ro2, rn2, seqold, expected, NULp)
787 
788 #define TEST_REMAP1REF_FWDREV(id, ro, rn, so, sn) \
789  TEST_REMAP1REF(id "(fwd)", ro, rn, so, sn); \
790  TEST_REMAP1REF(id "(rev)", rn, ro, sn, so)
791 
792 #define TEST_REMAP1REF_ALLDIR(id, ro, rn, so, sn) \
793  TEST_REMAP1REF_FWDREV(id "/ref=ref", ro, rn, so, sn); \
794  TEST_REMAP1REF_FWDREV(id "/ref=src", so, sn, ro, rn)
795 
796 #define TEST_REMAP2REFS_FWDREV(id, ro1, rn1, ro2, rn2, so, sn) \
797  TEST_REMAP2REFS(id "(fwd)", ro1, rn1, ro2, rn2, so, sn); \
798  TEST_REMAP2REFS(id "(rev)", rn1, ro1, rn2, ro2, sn, so)
799 
800 #define TEST_REMAP2REFS_ALLDIR(id, ro1, rn1, ro2, rn2, so, sn) \
801  TEST_REMAP2REFS_FWDREV(id "/ref=r1+r2 ", ro1, rn1, ro2, rn2, so, sn); \
802  TEST_REMAP2REFS_FWDREV(id "/ref=r1+src", ro1, rn1, so, sn, ro2, rn2); \
803  TEST_REMAP2REFS_FWDREV(id "/ref=r2+src", ro2, rn2, so, sn, ro1, rn1)
804 
805 __ATTR__REDUCED_OPTIMIZE__NO_GCSE void TEST_remapping() {
806  // ----------------------------------------
807  // remap with 1 reference
808 
809  TEST_REMAP1REF_ALLDIR("simple gap",
810  "ACGT", "AC--GT",
811  "TGCA", "TG--CA");
812 
813  TEST_REMAP1REF_FWDREV("dotgap",
814  "ACGT", "AC..GT", // dots in reference do not get propagated
815  "TGCA", "TG--CA");
816 
817  TEST_REMAP1REF("unused position (1)",
818  "A-C-G-T", "A--C--G--T",
819  "A---G-T", "A-----G--T");
820  TEST_REMAP1REF("unused position (2)",
821  "A-C-G-T", "A--C--G--T",
822  "Az-z-zT", "Az--z--z-T");
823 
824  TEST_REMAP1REF("empty (1)",
825  "A-C-G", "A--C--G",
826  "", ".......");
827  TEST_REMAP1REF("empty (2)",
828  "...A-C-G...", "...A--C--G...",
829  "", "..........");
830 
831  TEST_REMAP1REF("leadonly",
832  "...A-C-G...", "...A--C--G...",
833  "XX", ".XX.......");
834  TEST_REMAP1REF("trailonly",
835  "...A-C-G...", "...A--C--G...",
836  ".........XX", "..........XX");
837  TEST_REMAP1REF__BROKEN("lead+trail",
838  "...A-C-G...", "...A--C--G...",
839  "XX.......XX", ".XX-------XX",
840  ".XX.......XX");
841 
842  TEST_REMAP1REF("enforce leading dots (1)",
843  "--A-T", "------A--T",
844  "--T-A", "......T--A"); // leading gaps shall always be dots
845  TEST_REMAP1REF("enforce leading dots (2)",
846  "---A-T", "------A--T",
847  "-.-T-A", "......T--A"); // leading gaps shall always be dots
848  TEST_REMAP1REF("enforce leading dots (3)",
849  "---A-T", "------A--T",
850  "...T-A", "......T--A"); // leading gaps shall always be dots
851  TEST_REMAP1REF("enforce leading dots (4)",
852  "-..--A-T", "--.---A--T",
853  "-.-.-T-A", "......T--A"); // leading gaps shall always be dots
854  TEST_REMAP1REF("enforce leading dots (5)",
855  "---A-T-", "---A----T",
856  "-----A-", "........A"); // leading gaps shall always be dots
857  TEST_REMAP1REF("enforce leading dots (6)",
858  "---A-T-", "---A----T",
859  ".....A-", "........A"); // leading gaps shall always be dots
860 
861  TEST_REMAP1REF("no trailing gaps",
862  "A-T", "A--T---",
863  "T-A", "T--A");
864 
865  TEST_REMAP1REF("should expand full-dot-gaps (1)",
866  "AC-GT", "AC--GT",
867  "TG.CA", "TG..CA"); // if gap was present and only contained dots -> new gap should only contain dots as well
868  TEST_REMAP1REF("should expand full-dot-gaps (2)",
869  "AC--GT", "AC----GT",
870  "TG..CA", "TG....CA"); // if gap was present and only contained dots -> new gap should only contain dots as well
871 
872  TEST_REMAP1REF("keep 'missing bases' (1)",
873  "AC---GT", "AC---GT",
874  "TG-.-CA", "TG-.-CA"); // missing bases should not be deleted
875 
876  TEST_REMAP1REF("keep 'missing bases' (2)",
877  "AC-D-GT", "AC-D-GT",
878  "TG-.-CA", "TG-.-CA"); // missing bases should not be deleted
879 
880  // ----------------------------------------
881  // remap with 2 references
882  TEST_REMAP2REFS_ALLDIR("simple 3-way",
883  "A-CA-C-T", "AC-A---CT",
884  "A---GC-T", "A----G-CT",
885  "T-GAC--A", "TG-A-C--A");
886 
887  TEST_REMAP2REFS("undefpos-nogap(2U)",
888  "---A", "------A",
889  "C---", "C------",
890  "GUUG", "GU---UG");
891  TEST_REMAP2REFS("undefpos-nogap(3U)",
892  "C----", "C------",
893  "----A", "------A",
894  "GUUUG", "GU--UUG");
895  TEST_REMAP2REFS("undefpos-nogap(4U)",
896  "-----A", "------A",
897  "C-----", "C------",
898  "GUUUUG", "GUU-UUG");
899  TEST_REMAP2REFS("undefpos-nogap(4U2)",
900  "-----A", "-------A",
901  "C-----", "C-------",
902  "GUUUUG", "GUU--UUG");
903 
904  TEST_REMAP2REFS("undefpos1-gapleft",
905  "----A", "------A",
906  "C----", "C------",
907  "G-UUG", "G---UUG");
908  TEST_REMAP2REFS("undefpos1-gapright",
909  "----A", "------A",
910  "C----", "C------",
911  "GUU-G", "GU--U-G"); // behavior changed (prior: "GUU---G")
912 
913  // test non-full sequences
914  TEST_REMAP2REFS("missing ali-pos (ref1-source)",
915  "C", "C--", // in-seq missing 1 ali pos
916  "-A", "--A",
917  "GG", "G-G");
918 
919  TEST_REMAP2REFS("missing ali-pos (ref2-source)",
920  "-A", "--A",
921  "C", "C--", // in-seq missing 1 ali pos
922  "GG", "G-G");
923 
924  TEST_REMAP2REFS("missing ali-pos (ref1-target)",
925  "C-", "C", // out-seq missing 2 ali pos
926  "-A", "--A",
927  "GG", "G-G");
928  TEST_REMAP2REFS("missing ali-pos (ref2-target)",
929  "-A", "--A",
930  "C-", "C", // out-seq missing 2 ali pos
931  "GG", "G-G");
932 
933  TEST_REMAP2REFS("missing ali-pos (seq-source)",
934  "C--", "C--",
935  "-A-", "--A",
936  "GG", "G-G"); // in-seq missing 1 ali pos
937 
938  // --------------------
939  // test (too) small gaps
940 
941  TEST_REMAP1REF("gap gets too small (1)",
942  "A---T---A", "A--T--A",
943  "AGGGT---A", "AGGGT-A");
944 
945  TEST_REMAP1REF("gap gets too small (2)",
946  "A---T---A", "A--T--A",
947  "AGGGT--GA", "AGGGTGA");
948 
949  TEST_REMAP1REF("gap gets too small (3)",
950  "A---T---A", "A--T--A",
951  "AGGGT-G-A", "AGGGTGA");
952 
953  TEST_REMAP1REF("gap gets too small (4)",
954  "A---T---A", "A--T--A",
955  "AGGGTG--A", "AGGGTGA");
956 
957  TEST_REMAP1REF("gap tightens to fit (1)",
958  "A---T---A", "A--T--A",
959  "AGG-T---A", "AGGT--A");
960 
961  TEST_REMAP1REF("gap tightens to fit (2)",
962  "A---T---A", "A--T--A",
963  "AGC-T---A", "AGCT--A");
964  TEST_REMAP1REF("gap tightens to fit (2)",
965  "A---T---A", "A--T--A",
966  "A-CGT---A", "ACGT--A");
967 
968  TEST_REMAP1REF("gap with softmapping conflict (1)",
969  "A-------A", "A----A",
970  "A-CGT---A", "ACGT-A");
971  TEST_REMAP1REF("gap with softmapping conflict (2)",
972  "A-------A", "A----A",
973  "A--CGT--A", "ACGT-A");
974  TEST_REMAP1REF("gap with softmapping conflict (3)",
975  "A-------A", "A----A",
976  "A---CGT-A", "A-CGTA");
977  TEST_REMAP1REF("gap with softmapping conflict (4)",
978  "A-------A", "A----A",
979  "A----CGTA", "A-CGTA");
980  TEST_REMAP1REF("gap with softmapping conflict (5)",
981  "A-------A", "A----A",
982  "AC----GTA", "AC-GTA");
983 
984  TEST_REMAP1REF("drop missing bases to avoid misalignment (1)",
985  "A---T", "A--T",
986  "AG.GT", "AGGT");
987  TEST_REMAP1REF("drop missing bases to avoid misalignment (2)",
988  "A----T", "A--T",
989  "AG..GT", "AGGT");
990  TEST_REMAP1REF("dont drop missing bases if fixed map",
991  "A-C-T", "A-CT",
992  "AG.GT", "AG.GT");
993  TEST_REMAP1REF("dont drop gaps if fixed map",
994  "A-C-T", "A-CT",
995  "AG-GT", "AG-GT");
996 
997  // --------------------
998 
999  TEST_REMAP1REF("append rest of seq (1)",
1000  "AT" , "A--T",
1001  "AGG---T...A...", "A--GGTA");
1002  TEST_REMAP1REF("append rest of seq (2)",
1003  "AT." , "A--T--.",
1004  "AGG---T...A...", "A--GGTA");
1005  TEST_REMAP1REF("append rest of seq (3)",
1006  "AT--" , "A--T---",
1007  "AGG---T...A...", "A--GGTA");
1008  TEST_REMAP1REF("append rest of seq (4)",
1009  "ATC" , "A--T--C",
1010  "AGG---T...A...", "A--G--GTA");
1011  TEST_REMAP1REF("append rest of seq (4)",
1012  "AT-C" , "A--T--C",
1013  "AGG---T...A...", "A--GG--TA");
1014 
1015  // --------------------
1016 
1017  TEST_REMAP2REFS("impossible references",
1018  "AC-TA", "A---CT--A", // impossible references
1019  "A-GTA", "AG---T--A",
1020  "TGCAT", "T---GCA-T"); // solve by insertion (partial misalignment)
1021 
1022  TEST_REMAP2REFS("impossible references",
1023  "AC-TA", "A--C-T--A", // impossible references
1024  "A-GTA", "AG---T--A",
1025  "TGCAT", "T--GCA--T"); // solve by insertion (partial misalignment)
1026 }
1027 
1028 #endif // UNIT_TESTS
const char * GB_ERROR
Definition: arb_core.h:25
string result
void cut_tail(size_t byte_count)
Definition: arb_strbuf.h:145
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1387
softbase(char base_, int origin_, char last_gapchar_)
void GBT_get_alignment_names(ConstStrArray &names, GBDATA *gbd)
Definition: adali.cxx:317
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1060
const int NO_POSITION
MG_remaps(GBDATA *gb_left, GBDATA *gb_right, bool enable, const char *reference_species_names)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
void warning(int warning_num, const char *warning_message)
Definition: util.cxx:61
static char * alignment_name
char * release()
Definition: arb_strbuf.h:129
void nput(char c, size_t count)
Definition: arb_strbuf.h:164
CONSTEXPR_INLINE int calc_signed_digits(NUM val)
Definition: arbtools.h:210
void cat(const char *from)
Definition: arb_strbuf.h:183
static HelixNrInfo * start
GBDATA * GBT_find_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:177
GB_ERROR MG_adaptAllCopiedAlignments(const MG_remaps &remaps, GBDATA *source_species, GBDATA *destination_species)
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
fflush(stdout)
GB_TYPES GB_read_type(GBDATA *gbd)
Definition: arbdb.cxx:1643
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
static void drop_dots(softbaselist &softbases, int excessive_positions)
softbaselist::const_iterator const_softbaseiter
CONSTEXPR_INLINE int digits(int parts)
#define mg_assert(bed)
Definition: merge.hxx:24
static void error(const char *msg)
Definition: mkptypes.cxx:96
fputc('\n', stderr)
const char * add_reference(const char *in_reference, const char *out_reference)
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
const MG_remap & remap(int idx) const
const char * alignment_name(int idx) const
char * remap(const char *sequence) const
const int LEFT_BORDER
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:708
static GB_ERROR adaptCopiedAlignment(const MG_remap &remap, GBDATA *source_species, GBDATA *destination_species, const char *alignment_name)
static void copy(double **i, double **j)
Definition: trnsprob.cxx:32
const int RIGHT_BORDER
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
int size() const
static MG_remap * MG_create_remap(GBDATA *gb_left, GBDATA *gb_right, const char *reference_species_names, const char *alignment_name)
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:909
uint32_t GBS_checksum(const char *seq, int ignore_case, const char *exclude)
Definition: adstring.cxx:352
void aw_message(const char *msg)
Definition: AW_status.cxx:1142
bool is_std_gap(const char c)
#define NULp
Definition: cxxforward.h:114
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
#define __ATTR__REDUCED_OPTIMIZE__NO_GCSE
Definition: test_unit.h:88
const char * get_data() const
Definition: arb_strbuf.h:120
GB_TYPES
Definition: arbdb.h:62
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
softbaselist::iterator softbaseiter
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:459
#define min(a, b)
Definition: f2c.h:153
static int info[maxsites+1]
bool doesRemap() const
char last_gapchar
size_t get_position() const
Definition: arb_strbuf.h:112
std::list< softbase > softbaselist
void put(char c)
Definition: arb_strbuf.h:158
#define max(a, b)
Definition: f2c.h:154