ARB
MG_adapt_ali.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : MG_adapt_ali.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include "merge.hxx"
12 #include "MG_adapt_ali.hxx"
13 
14 #include <aw_msg.hxx>
15 #include <arbdbt.h>
16 #include <arb_strbuf.h>
17 #include <arb_global_defs.h>
18 
19 #include <cmath>
20 #include <list>
21 
22 #if defined(DEBUG)
23 // #define DUMP_MAPPING
24 // #define DUMP_SOFTMAPPING
25 #endif // DEBUG
26 
27 // -----------------
28 // MG_remap
29 
30 const int NO_POSITION = -1;
31 const int LEFT_BORDER = -2;
32 const int RIGHT_BORDER = -3;
33 
34 struct softbase {
35  char base;
36  int origin; // position in source alignment
37  char last_gapchar; // last gap seen before base
38  int targetpos; // target position
39 
40  softbase(char base_, int origin_, char last_gapchar_) :
41  base(base_),
42  origin(origin_),
43  last_gapchar(last_gapchar_),
44  targetpos(NO_POSITION)
45  {
46  mg_assert(last_gapchar);
47  }
48 
49  operator char () const { return base; }
50 };
51 
52 typedef std::list<softbase> softbaselist;
53 typedef softbaselist::iterator softbaseiter;
54 typedef softbaselist::const_iterator const_softbaseiter;
55 
56 class MG_remap : virtual Noncopyable { // @@@ rename MG_remap -> AliRemapper
57  int in_length;
58  int out_length;
59  int *remap_tab; // fixed mapping (targetPosition or NO_POSITION)
60 
61  // soft-mapping:
62  mutable int *soft_remap_tab; // soft-mapping (NO_POSITION, targetPosition, LEFT_BORDER or RIGHT_BORDER)
63 
64  char *calc_softmapping(softbaselist& softbases, int start, int end, int &outlen) const;
65  int softmap_to(softbaselist& softbases, int start, int end, GBS_strstruct *outs) const;
66 
67  bool have_softmapping() const { return soft_remap_tab; }
68  void create_softmapping() const;
69  void forget_softmapping() const {
70  delete [] soft_remap_tab;
71  soft_remap_tab = NULp;
72  }
73 
74  static int *build_initial_mapping(const char *iref, int ilen, const char *oref, int olen);
75  void merge_mapping(MG_remap &other, int& inconsistent, int& added);
76 
77 public:
78 
80  in_length(0),
81  out_length(0),
82  remap_tab(NULp),
83  soft_remap_tab(NULp)
84  {}
86  forget_softmapping();
87  delete [] remap_tab;
88  }
89 
90  const char *add_reference(const char *in_reference, const char *out_reference); // returns only warnings
91  char *remap(const char *sequence) const; // returns 0 on error, else copy of sequence
92 
93 #if defined(DUMP_MAPPING)
94  static void dump(const int *data, int len, const char *comment, int dontShow) {
95  fflush(stdout);
96  fflush(stderr);
97  fputc('>', stdout);
98  int digits = calc_signed_digits(len);
99  for (int pos = 0; pos<len; ++pos) {
100  if (data[pos] == dontShow) {
101  fprintf(stdout, "%*s", digits, "_");
102  }
103  else {
104  fprintf(stdout, "%*i", digits, data[pos]);
105  }
106  }
107  fprintf(stdout, " (%s)\n", comment);
108  fflush(stdout);
109  }
110  void dump_remap(const char *comment) { dump(remap_tab, in_length, comment, NO_POSITION); }
111 #endif // DUMP_MAPPING
112 };
113 
114 int *MG_remap::build_initial_mapping(const char *iref, int ilen, const char *oref, int olen) {
115  int *remap = new int[ilen];
116 
117  const char *spacers = "-. n";
118 
119  int ipos = 0;
120  int opos = 0;
121 
122  while (ipos<ilen && opos<olen) {
123  size_t ispaces = strspn(iref+ipos, spacers);
124  size_t ospaces = strspn(oref+opos, spacers);
125 
126  while (ispaces && ipos<ilen) {
127  remap[ipos++] = NO_POSITION;
128  ispaces--;
129  }
130  opos += ospaces;
131  if (ipos<ilen && opos<olen) remap[ipos++] = opos++;
132  }
133  while (ipos<ilen) remap[ipos++] = NO_POSITION;
134 
135  return remap;
136 }
137 
138 #if defined(DEBUG)
139 // #define DUMP_INCONSISTENCIES
140 #endif // DEBUG
141 
142 
143 void MG_remap::merge_mapping(MG_remap &other, int& inconsistent, int& added) {
144  const int *primary = remap_tab;
145  int *secondary = other.remap_tab;
146  bool increased_len = false;
147 
148  if (other.in_length>in_length) {
149  // re-use memory of bigger map
150  std::swap(other.in_length, in_length);
151  std::swap(other.remap_tab, remap_tab);
152 
153  increased_len = true;
154  }
155  out_length = std::max(out_length, other.out_length); // remember biggest output length
156 
157  int mixlen = std::min(in_length, other.in_length);
158 
159  // eliminate inconsistant positions from secondary mapping (forward)
160  inconsistent = 0;
161  {
162  int max_target_pos = 0;
163  for (int pos = 0; pos<mixlen; ++pos) {
164  max_target_pos = std::max(max_target_pos, primary[pos]);
165  if (secondary[pos]<max_target_pos) {
166  if (secondary[pos] != NO_POSITION) {
167 #if defined(DUMP_INCONSISTENCIES)
168  fprintf(stderr, "merge-inconsistency: pos=%i primary[]=%i secondary[]=%i max_target_pos=%i\n",
169  pos, primary[pos], secondary[pos], max_target_pos);
170 #endif // DUMP_INCONSISTENCIES
171  inconsistent++;
172  }
173  secondary[pos] = NO_POSITION; // consistency error -> ignore position
174  }
175  }
176  }
177  // (backward)
178  {
179  int min_target_pos = out_length-1;
180  for (int pos = mixlen-1; pos >= 0; --pos) {
181  if (primary[pos] >= 0 && primary[pos]<min_target_pos) min_target_pos = primary[pos];
182  if (secondary[pos] > min_target_pos) {
183 #if defined(DUMP_INCONSISTENCIES)
184  fprintf(stderr, "merge-inconsistency: pos=%i primary[]=%i secondary[]=%i min_target_pos=%i\n",
185  pos, primary[pos], secondary[pos], min_target_pos);
186 #endif // DUMP_INCONSISTENCIES
187  inconsistent++;
188  secondary[pos] = NO_POSITION; // consistency error -> ignore position
189  }
190  }
191  }
192 
193  // merge mappings
194  added = 0;
195  for (int pos = 0; pos < mixlen; ++pos) {
196  if (primary[pos] == NO_POSITION) {
197  remap_tab[pos] = secondary[pos];
198  added += (remap_tab[pos] != NO_POSITION);
199  }
200  else {
201  remap_tab[pos] = primary[pos];
202  }
203  // remap_tab[pos] = primary[pos] == NO_POSITION ? secondary[pos] : primary[pos];
204  mg_assert(remap_tab[pos]<out_length);
205  }
206 
207  if (increased_len) { // count positions appended at end of sequence
208  for (int pos = other.in_length; pos<in_length; ++pos) {
209  added += (remap_tab[pos] != NO_POSITION);
210  }
211  }
212 
213  // Note: copying the rest from larger mapping is not necessary
214  // (it's already there, because of memory-reuse)
215 }
216 
217 const char *MG_remap::add_reference(const char *in_reference, const char *out_reference) {
218  // returns warning about inconsistencies/useless reference
219  const char *warning = NULp;
220 
221  if (have_softmapping()) forget_softmapping();
222 
223  if (!remap_tab) {
224  in_length = strlen(in_reference);
225  out_length = strlen(out_reference);
226  remap_tab = build_initial_mapping(in_reference, in_length, out_reference, out_length);
227 #if defined(DUMP_MAPPING)
228  dump_remap("initial");
229 #endif // DUMP_MAPPING
230  }
231  else {
232  MG_remap tmp;
233  tmp.add_reference(in_reference, out_reference);
234 
235  int inconsistent, added;
236  merge_mapping(tmp, inconsistent, added);
237 
238  if (inconsistent>0) warning = GBS_global_string("contains %i inconsistent positions", inconsistent);
239  if (added<1) {
240  const char *useless_warning = "doesn't add useful information";
241  if (warning) warning = GBS_global_string("%s and %s", warning, useless_warning);
242  else warning = useless_warning;
243  }
244 
245 #if defined(DUMP_MAPPING)
246  dump_remap("merged");
247 #endif // DUMP_MAPPING
248  }
249 
250  return warning;
251 }
252 
253 void MG_remap::create_softmapping() const {
254  soft_remap_tab = new int[in_length];
255 
256  int last_fixed_position = NO_POSITION;
257  int pos;
258  for (pos = 0; pos<in_length && last_fixed_position == NO_POSITION; ++pos) {
259  if (remap_tab[pos] == NO_POSITION) {
260  soft_remap_tab[pos] = LEFT_BORDER;
261  }
262  else {
263  soft_remap_tab[pos] = NO_POSITION;
264  last_fixed_position = pos;
265  }
266  }
267  if (last_fixed_position != NO_POSITION) {
268  for ( ; pos<in_length; ++pos) {
269  if (remap_tab[pos] != NO_POSITION) {
270  int softstart = last_fixed_position+1;
271  int softsize = pos-softstart;
272 
273  if (softsize>0) {
274  int target_softstart = remap_tab[last_fixed_position]+1;
275  int target_softsize = remap_tab[pos]-target_softstart;
276 
277  double target_step;
278  if (softsize>1 && target_softsize>1) {
279  target_step = double(target_softsize-1)/(softsize-1);
280  }
281  else {
282  target_step = 0.0;
283  }
284 
285  if (target_step >= 1.0 && target_softsize>softsize) {
286  // target range > source range -> split softmapping in the middle
287  int halfsoftsize = softsize/2;
288  int target_softpos = softstart;
289  int off;
290  for (off = 0; off<halfsoftsize; ++off) { // LOOP_VECTORIZED
291  soft_remap_tab[softstart+off] = target_softpos++;
292  }
293  target_softpos += target_softsize-softsize;
294  for (; off<softsize; ++off) { // LOOP_VECTORIZED
295  soft_remap_tab[softstart+off] = target_softpos++;
296  }
297  }
298  else {
299  double target_softpos = target_softstart;
300  for (int off = 0; off<softsize; ++off) {
301  soft_remap_tab[softstart+off] = int(target_softpos+0.5);
302  target_softpos += target_step;
303  }
304  }
305  }
306  last_fixed_position = pos;
307  soft_remap_tab[pos] = NO_POSITION;
308  }
309  }
310 
311  for (--pos; pos>last_fixed_position; --pos) { // LOOP_VECTORIZED
312  soft_remap_tab[pos] = RIGHT_BORDER;
313  }
314  }
315 
316 #if defined(DUMP_MAPPING)
317  dump(soft_remap_tab, in_length, "softmap", -1);
318 #endif // DUMP_MAPPING
319 }
320 
321 static void drop_dots(softbaselist& softbases, int excessive_positions) {
322  // drop consecutive dots
323  bool justseendot = false;
324 
325  softbaseiter next = softbases.begin();
326  while (excessive_positions && next != softbases.end()) {
327  bool isdot = (next->base == '.');
328  if (isdot && justseendot) {
329  excessive_positions--;
330  next = softbases.erase(next);
331  }
332  else {
333  justseendot = isdot;
334  ++next;
335  }
336  }
337 
338  if (excessive_positions) {
339  // drop single dots
340  next = softbases.begin();
341  while (excessive_positions && next != softbases.end()) {
342  if (next->base == '.') {
343  next = softbases.erase(next);
344  }
345  else {
346  ++next;
347  }
348  }
349  }
350 }
351 
352 #if defined(DUMP_SOFTMAPPING)
353 static char *softbaselist_2_string(const softbaselist& softbases) {
354  GBS_strstruct *out = GBS_stropen(softbases.size()*100);
355 
356  for (const_softbaseiter base = softbases.begin(); base != softbases.end(); ++base) {
357  const char *info = GBS_global_string(" %c'%c' %i->",
358  base->last_gapchar ? base->last_gapchar : ' ',
359  base->base,
360  base->origin);
361  GBS_strcat(out, info);
362  if (base->targetpos == NO_POSITION) {
363  GBS_strcat(out, "NP");
364  }
365  else {
366  GBS_intcat(out, base->targetpos);
367  }
368  }
369 
370  return GBS_strclose(out);
371 }
372 #endif // DUMP_SOFTMAPPING
373 
374 char *MG_remap::calc_softmapping(softbaselist& softbases, int start, int end, int& outlen) const {
377 
378  int wanted_size = end-start+1;
379  int listsize = softbases.size();
380 
381 #if defined(DUMP_SOFTMAPPING)
382  char *sbl_initial = softbaselist_2_string(softbases);
383  char *sbl_exdots = NULp;
384  char *sbl_target = NULp;
385  char *sbl_exclash = NULp;
386 #endif // DUMP_SOFTMAPPING
387 
388  if (listsize>wanted_size) {
389  int excessive_positions = listsize-wanted_size;
390  drop_dots(softbases, excessive_positions);
391  listsize = softbases.size();
392 
393 #if defined(DUMP_SOFTMAPPING)
394  sbl_exdots = softbaselist_2_string(softbases);
395 #endif // DUMP_SOFTMAPPING
396  }
397 
398  char *result = NULp;
399  if (listsize >= wanted_size) { // not or just enough space -> plain copy
400  ARB_alloc(result, listsize+1);
401  *std::copy(softbases.begin(), softbases.end(), result) = 0;
402  outlen = listsize;
403  }
404  else { // otherwise do soft-mapping
405  ARB_alloc(result, wanted_size+1);
406  mg_assert(listsize < wanted_size);
407 
408  // calculate target positions and detect mapping conflicts
409  bool conflicts = false;
410  {
411  int lasttargetpos = NO_POSITION;
412  for (softbaseiter base = softbases.begin(); base != softbases.end(); ++base) {
413  // // int targetpos = calc_softpos(base->origin);
414  int targetpos = soft_remap_tab[base->origin];
415  if (targetpos == lasttargetpos) {
416  mg_assert(targetpos != NO_POSITION);
417  conflicts = true;
418  }
419  base->targetpos = lasttargetpos = targetpos;
420  }
421  }
422 
423 #if defined(DUMP_SOFTMAPPING)
424  sbl_target = softbaselist_2_string(softbases);
425 #endif // // DUMP_SOFTMAPPING
426 
427  if (conflicts) {
428  int nextpos = end+1;
429  for (softbaselist::reverse_iterator base = softbases.rbegin(); base != softbases.rend(); ++base) {
430  if (base->targetpos >= nextpos) {
431  base->targetpos = nextpos-1;
432  }
433  nextpos = base->targetpos;
434  mg_assert(base->targetpos >= start);
435  mg_assert(base->targetpos <= end);
436  }
437  mg_assert(nextpos >= start);
438 
439 #if defined(DUMP_SOFTMAPPING)
440  sbl_exclash = softbaselist_2_string(softbases);
441 #endif // // DUMP_SOFTMAPPING
442  }
443 
444  int idx = 0;
445  for (softbaseiter base = softbases.begin(); base != softbases.end(); ++base) {
446  int pos = base->targetpos - start;
447 
448  if (idx<pos) {
449  char gapchar = base->last_gapchar;
450  while (idx<pos) result[idx++] = gapchar;
451  }
452  result[idx++] = base->base;
453  }
454  result[idx] = 0;
455  outlen = idx;
456  mg_assert(idx <= wanted_size);
457  }
458 
459 #if defined(DUMP_SOFTMAPPING)
460  fflush(stdout);
461  fflush(stderr);
462  printf("initial:%s\n", sbl_initial);
463  if (sbl_exdots) printf("exdots :%s\n", sbl_exdots);
464  if (sbl_target) printf("target :%s\n", sbl_target);
465  if (sbl_exclash) printf("exclash:%s\n", sbl_exclash);
466  printf("calc_softmapping(%i, %i) -> \"%s\"\n", start, end, result);
467  fflush(stdout);
468  free(sbl_exclash);
469  free(sbl_target);
470  free(sbl_exdots);
471  free(sbl_initial);
472 #endif // DUMP_SOFTMAPPING
473 
474  return result;
475 }
476 
477 int MG_remap::softmap_to(softbaselist& softbases, int start, int end, GBS_strstruct *outs) const {
478  int mappedlen;
479  char *softmapped = calc_softmapping(softbases, start, end, mappedlen);
480 
481  GBS_strcat(outs, softmapped);
482  free(softmapped);
483  softbases.clear();
484 
485  return mappedlen;
486 }
487 
488 char *MG_remap::remap(const char *sequence) const {
489  int slen = strlen(sequence);
490  int minlen = std::min(slen, in_length);
491  GBS_strstruct *outs = GBS_stropen(out_length+1);
492  int written = 0;
493  softbaselist softbases;
494  int pos;
495 
496  if (!have_softmapping()) create_softmapping();
497 
498  // remap left border
499  for (pos = 0; pos<in_length && soft_remap_tab[pos] == LEFT_BORDER; ++pos) {
500  char c = pos<slen ? sequence[pos] : '-';
501  if (!GAP::is_std_gap(c)) {
502  mg_assert(pos<slen); // @@@ simplify code above
503  softbases.push_back(softbase(c, pos, '.'));
504  }
505  }
506  char last_gapchar = 0;
507  {
508  int bases = softbases.size();
509  if (bases) {
510  int fixed = remap_tab[pos];
511  mg_assert(fixed != NO_POSITION);
512 
513  int bases_start = fixed-bases;
514  if (written<bases_start) {
515  GBS_chrncat(outs, '.', bases_start-written);
516  written = bases_start;
517  }
518 
519  written += softmap_to(softbases, written, fixed-1, outs);
520  }
521  else {
522  last_gapchar = '.';
523  }
524  }
525 
526 
527  // remap center (fixed mapping and softmapping inbetween)
528  for (; pos<in_length; ++pos) {
529  char c = pos<slen ? sequence[pos] : '-';
530  int target_pos = remap_tab[pos];
531 
532  if (target_pos == NO_POSITION) { // softmap
533  if (c == '-') {
534  if (!last_gapchar) last_gapchar = '-';
535  }
536  else {
537  if (!last_gapchar) last_gapchar = c == '.' ? '.' : '-';
538 
539  softbases.push_back(softbase(c, pos, last_gapchar)); // remember bases for softmapping
540  if (c != '.') last_gapchar = 0;
541  }
542  }
543  else { // fixed mapping
544  if (!softbases.empty()) {
545  written += softmap_to(softbases, written, target_pos-1, outs);
546  }
547  if (written<target_pos) {
548  if (!last_gapchar) last_gapchar = c == '.' ? '.' : '-';
549 
550  GBS_chrncat(outs, last_gapchar, target_pos-written);
551  written = target_pos;
552  }
553 
554  if (c == '-') {
555  if (!last_gapchar) last_gapchar = '-';
556  GBS_chrcat(outs, last_gapchar); written++;
557  }
558  else {
559  GBS_chrcat(outs, c); written++;
560  if (c != '.') last_gapchar = 0;
561  }
562  }
563  }
564 
565  // Spool leftover softbases.
566  // Happens when
567  // - sequence ends before last fixed position
568  // - sequence starts after last fixed position
569 
570  if (!softbases.empty()) {
571  // softmap_to(softbases, written, written+softbases.size()-1, outs);
572  softmap_to(softbases, written, written, outs);
573  }
574  mg_assert(softbases.empty());
575 
576  // copy overlength rest of sequence:
577  const char *gap_chars = "- .";
578  for (int i = minlen; i < slen; i++) {
579  int c = sequence[i];
580  if (!strchr(gap_chars, c)) GBS_chrcat(outs, c); // don't fill with gaps
581  }
582 
583  // convert end of seq ('-' -> '.')
584  {
585  char *out = GBS_mempntr(outs);
586  int end = GBS_memoffset(outs);
587 
588  for (int off = end-1; off >= 0; --off) {
589  if (out[off] == '-') out[off] = '.';
590  else if (out[off] != '.') break;
591  }
592  }
593 
594  return GBS_strclose(outs);
595 }
596 
597 // --------------------------------------------------------------------------------
598 
599 static MG_remap *MG_create_remap(GBDATA *gb_left, GBDATA *gb_right, const char *reference_species_names, const char *alignment_name) {
600  MG_remap *rem = new MG_remap;
601  char *ref_list = ARB_strdup(reference_species_names);
602 
603  for (char *tok = strtok(ref_list, " \n,;"); tok; tok = strtok(NULp, " \n,;")) {
604  bool is_SAI = strncmp(tok, "SAI:", 4) == 0;
605  GBDATA *gb_species_left = NULp;
606  GBDATA *gb_species_right = NULp;
607 
608  if (is_SAI) {
609  gb_species_left = GBT_find_SAI(gb_left, tok+4);
610  gb_species_right = GBT_find_SAI(gb_right, tok+4);
611  }
612  else {
613  gb_species_left = GBT_find_species(gb_left, tok);
614  gb_species_right = GBT_find_species(gb_right, tok);
615  }
616 
617  if (!gb_species_left || !gb_species_right) {
618  aw_message(GBS_global_string("Warning: Couldn't find %s'%s' in %s DB.\nPlease read ADAPT ALIGNMENT section in help file!",
619  is_SAI ? "" : "species ",
620  tok,
621  gb_species_left ? "right" : "left"));
622  }
623  else {
624  // look for sequence/SAI "data"
625  GBDATA *gb_seq_left = GBT_find_sequence(gb_species_left, alignment_name);
626  GBDATA *gb_seq_right = GBT_find_sequence(gb_species_right, alignment_name);
627 
628  if (gb_seq_left && gb_seq_right) {
629  GB_TYPES type_left = GB_read_type(gb_seq_left);
630  GB_TYPES type_right = GB_read_type(gb_seq_right);
631 
632  if (type_left != type_right) {
633  aw_message(GBS_global_string("Warning: data type of '%s' differs in both databases", tok));
634  }
635  else {
636  const char *warning;
637  if (type_right == GB_STRING) {
638  warning = rem->add_reference(GB_read_char_pntr(gb_seq_left), GB_read_char_pntr(gb_seq_right));
639  }
640  else {
641  char *sleft = GB_read_as_string(gb_seq_left);
642  char *sright = GB_read_as_string(gb_seq_right);
643 
644  warning = rem->add_reference(sleft, sright);
645 
646  free(sleft);
647  free(sright);
648  }
649 
650  if (warning) aw_message(GBS_global_string("Warning: '%s' %s", tok, warning));
651  }
652  }
653  }
654  }
655  free(ref_list);
656 
657  return rem;
658 }
659 
660 // --------------------------------------------------------------------------------
661 
662 MG_remaps::MG_remaps(GBDATA *gb_left, GBDATA *gb_right, bool enable, const char *reference_species_names)
663  : n_remaps(0),
664  remaps(NULp)
665 {
666  if (enable) { // otherwise nothing will be remapped!
667  GBT_get_alignment_names(alignment_names, gb_left);
668  for (n_remaps = 0; alignment_names[n_remaps]; n_remaps++) {} // count alignments
669 
670  ARB_calloc(remaps, n_remaps);
671  for (int i = 0; i<n_remaps; ++i) {
672  remaps[i] = MG_create_remap(gb_left, gb_right, reference_species_names, alignment_names[i]);
673  }
674  }
675 }
676 
678  mg_assert(correlated(n_remaps, remaps));
679  for (int i=0; i<n_remaps; i++) delete remaps[i];
680  free(remaps);
681 }
682 
683 static GB_ERROR adaptCopiedAlignment(const MG_remap& remap, GBDATA *source_species, GBDATA *destination_species, const char *alignment_name) {
684  // align sequence after copy
685  GB_ERROR error = NULp;
686 
687  GBDATA *gb_seq_left = GBT_find_sequence(source_species, alignment_name);
688  GBDATA *gb_seq_right = GBT_find_sequence(destination_species, alignment_name);
689 
690  if (gb_seq_left && gb_seq_right) { // if one DB hasn't sequence -> sequence was not copied
691  char *ls = GB_read_string(gb_seq_left);
692  char *rs = GB_read_string(gb_seq_right);
693 
694  if (strcmp(ls, rs) == 0) { // if sequences are not identical -> sequence was not copied
695  free(rs);
696  rs = remap.remap(ls);
697 
698  if (!rs) error = GB_await_error();
699  else {
700  long old_check = GBS_checksum(ls, 0, ".- ");
701  long new_check = GBS_checksum(rs, 0, ".- ");
702 
703  if (old_check == new_check) {
704  error = GB_write_string(gb_seq_right, rs);
705  }
706  else {
707  error = GBS_global_string("Failed to adapt alignment of '%s' (checksum changed)", GBT_get_name_or_description(source_species));
708  }
709  }
710  }
711  free(rs);
712  free(ls);
713  }
714 
715  return error;
716 }
717 
718 GB_ERROR MG_adaptAllCopiedAlignments(const MG_remaps& remaps, GBDATA *source_species, GBDATA *destination_species) {
723  GB_ERROR error = NULp;
724 
725  if (remaps.doesRemap()) {
726  for (int i=0; i<remaps.size() && !error; i++) {
727  error = adaptCopiedAlignment(remaps.remap(i), source_species, destination_species, remaps.alignment_name(i));
728  }
729  }
730 
731  return error;
732 }
733 
734 // --------------------------------------------------------------------------------
735 
736 #ifdef UNIT_TESTS
737 
738 #include <test_unit.h>
739 
740 // #define VERBOOSE_REMAP_TESTS
741 
742 #if defined(VERBOOSE_REMAP_TESTS)
743 #define DUMP_REMAP(id, comment, from, to) fprintf(stderr, "%s %s '%s' -> '%s'\n", id, comment, from, to)
744 #define DUMP_REMAP_STR(str) fputs(str, stderr)
745 #else
746 #define DUMP_REMAP(id, comment, from, to)
747 #define DUMP_REMAP_STR(str)
748 #endif
749 
750 #define TEST_REMAP(id, remapper, ASS_EQ, seqold, expected, got) \
751  char *calculated = remapper.remap(seqold); \
752  DUMP_REMAP(id, "want", seqold, expected); \
753  DUMP_REMAP(id, "calc", seqold, calculated); \
754  ASS_EQ(calculated, expected, got); \
755  DUMP_REMAP_STR("\n"); \
756  free(calculated);
757 
758 #define TEST_REMAP1REF_INT(id, ASS_EQ, refold, refnew, seqold, expected, got) \
759  do { \
760  MG_remap remapper; \
761  DUMP_REMAP(id, "ref ", refold, refnew); \
762  remapper.add_reference(refold, refnew); \
763  TEST_REMAP(id, remapper, ASS_EQ, seqold, expected, got); \
764  } while(0)
765 
766 #define TEST_REMAP2REFS_INT(id, ASS_EQ, refold1, refnew1, refold2, refnew2, seqold, expected, got) \
767  do { \
768  MG_remap remapper; \
769  DUMP_REMAP(id, "ref1", refold1, refnew1); \
770  DUMP_REMAP(id, "ref2", refold2, refnew2); \
771  remapper.add_reference(refold1, refnew1); \
772  remapper.add_reference(refold2, refnew2); \
773  TEST_REMAP(id, remapper, ASS_EQ, seqold, expected, got); \
774  } while(0)
775 
776 #define TEST_REMAP1REF(id,ro,rn,seqold,expected) TEST_REMAP1REF_INT(id, TEST_EXPECT_EQUAL__IGNARG, ro, rn, seqold, expected, NULp)
777 #define TEST_REMAP1REF__BROKEN(id,ro,rn,seqold,expected,got) TEST_REMAP1REF_INT(id, TEST_EXPECT_EQUAL__BROKEN, ro, rn, seqold, expected, got)
778 
779 #define TEST_REMAP2REFS(id,ro1,rn1,ro2,rn2,seqold,expected) TEST_REMAP2REFS_INT(id, TEST_EXPECT_EQUAL__IGNARG, ro1, rn1, ro2, rn2, seqold, expected, NULp)
780 
781 #define TEST_REMAP1REF_FWDREV(id, ro, rn, so, sn) \
782  TEST_REMAP1REF(id "(fwd)", ro, rn, so, sn); \
783  TEST_REMAP1REF(id "(rev)", rn, ro, sn, so)
784 
785 #define TEST_REMAP1REF_ALLDIR(id, ro, rn, so, sn) \
786  TEST_REMAP1REF_FWDREV(id "/ref=ref", ro, rn, so, sn); \
787  TEST_REMAP1REF_FWDREV(id "/ref=src", so, sn, ro, rn)
788 
789 #define TEST_REMAP2REFS_FWDREV(id, ro1, rn1, ro2, rn2, so, sn) \
790  TEST_REMAP2REFS(id "(fwd)", ro1, rn1, ro2, rn2, so, sn); \
791  TEST_REMAP2REFS(id "(rev)", rn1, ro1, rn2, ro2, sn, so)
792 
793 #define TEST_REMAP2REFS_ALLDIR(id, ro1, rn1, ro2, rn2, so, sn) \
794  TEST_REMAP2REFS_FWDREV(id "/ref=r1+r2 ", ro1, rn1, ro2, rn2, so, sn); \
795  TEST_REMAP2REFS_FWDREV(id "/ref=r1+src", ro1, rn1, so, sn, ro2, rn2); \
796  TEST_REMAP2REFS_FWDREV(id "/ref=r2+src", ro2, rn2, so, sn, ro1, rn1)
797 
798 void TEST_remapping() {
799  // ----------------------------------------
800  // remap with 1 reference
801 
802  TEST_REMAP1REF_ALLDIR("simple gap",
803  "ACGT", "AC--GT",
804  "TGCA", "TG--CA");
805 
806  TEST_REMAP1REF_FWDREV("dotgap",
807  "ACGT", "AC..GT", // dots in reference do not get propagated
808  "TGCA", "TG--CA");
809 
810  TEST_REMAP1REF("unused position (1)",
811  "A-C-G-T", "A--C--G--T",
812  "A---G-T", "A-----G--T");
813  TEST_REMAP1REF("unused position (2)",
814  "A-C-G-T", "A--C--G--T",
815  "Az-z-zT", "Az--z--z-T");
816 
817  TEST_REMAP1REF("empty (1)",
818  "A-C-G", "A--C--G",
819  "", ".......");
820  TEST_REMAP1REF("empty (2)",
821  "...A-C-G...", "...A--C--G...",
822  "", "..........");
823 
824  TEST_REMAP1REF("leadonly",
825  "...A-C-G...", "...A--C--G...",
826  "XX", ".XX.......");
827  TEST_REMAP1REF("trailonly",
828  "...A-C-G...", "...A--C--G...",
829  ".........XX", "..........XX");
830  TEST_REMAP1REF__BROKEN("lead+trail",
831  "...A-C-G...", "...A--C--G...",
832  "XX.......XX", ".XX-------XX",
833  ".XX.......XX");
834 
835  TEST_REMAP1REF("enforce leading dots (1)",
836  "--A-T", "------A--T",
837  "--T-A", "......T--A"); // leading gaps shall always be dots
838  TEST_REMAP1REF("enforce leading dots (2)",
839  "---A-T", "------A--T",
840  "-.-T-A", "......T--A"); // leading gaps shall always be dots
841  TEST_REMAP1REF("enforce leading dots (3)",
842  "---A-T", "------A--T",
843  "...T-A", "......T--A"); // leading gaps shall always be dots
844  TEST_REMAP1REF("enforce leading dots (4)",
845  "-..--A-T", "--.---A--T",
846  "-.-.-T-A", "......T--A"); // leading gaps shall always be dots
847  TEST_REMAP1REF("enforce leading dots (5)",
848  "---A-T-", "---A----T",
849  "-----A-", "........A"); // leading gaps shall always be dots
850  TEST_REMAP1REF("enforce leading dots (6)",
851  "---A-T-", "---A----T",
852  ".....A-", "........A"); // leading gaps shall always be dots
853 
854  TEST_REMAP1REF("no trailing gaps",
855  "A-T", "A--T---",
856  "T-A", "T--A");
857 
858  TEST_REMAP1REF("should expand full-dot-gaps (1)",
859  "AC-GT", "AC--GT",
860  "TG.CA", "TG..CA"); // if gap was present and only contained dots -> new gap should only contain dots as well
861  TEST_REMAP1REF("should expand full-dot-gaps (2)",
862  "AC--GT", "AC----GT",
863  "TG..CA", "TG....CA"); // if gap was present and only contained dots -> new gap should only contain dots as well
864 
865  TEST_REMAP1REF("keep 'missing bases' (1)",
866  "AC---GT", "AC---GT",
867  "TG-.-CA", "TG-.-CA"); // missing bases should not be deleted
868 
869  TEST_REMAP1REF("keep 'missing bases' (2)",
870  "AC-D-GT", "AC-D-GT",
871  "TG-.-CA", "TG-.-CA"); // missing bases should not be deleted
872 
873  // ----------------------------------------
874  // remap with 2 references
875  TEST_REMAP2REFS_ALLDIR("simple 3-way",
876  "A-CA-C-T", "AC-A---CT",
877  "A---GC-T", "A----G-CT",
878  "T-GAC--A", "TG-A-C--A");
879 
880  TEST_REMAP2REFS("undefpos-nogap(2U)",
881  "---A", "------A",
882  "C---", "C------",
883  "GUUG", "GU---UG");
884  TEST_REMAP2REFS("undefpos-nogap(3U)",
885  "C----", "C------",
886  "----A", "------A",
887  "GUUUG", "GU--UUG");
888  TEST_REMAP2REFS("undefpos-nogap(4U)",
889  "-----A", "------A",
890  "C-----", "C------",
891  "GUUUUG", "GUU-UUG");
892  TEST_REMAP2REFS("undefpos-nogap(4U2)",
893  "-----A", "-------A",
894  "C-----", "C-------",
895  "GUUUUG", "GUU--UUG");
896 
897  TEST_REMAP2REFS("undefpos1-gapleft",
898  "----A", "------A",
899  "C----", "C------",
900  "G-UUG", "G---UUG");
901  TEST_REMAP2REFS("undefpos1-gapright",
902  "----A", "------A",
903  "C----", "C------",
904  "GUU-G", "GU--U-G"); // behavior changed (prior: "GUU---G")
905 
906  // test non-full sequences
907  TEST_REMAP2REFS("missing ali-pos (ref1-source)",
908  "C", "C--", // in-seq missing 1 ali pos
909  "-A", "--A",
910  "GG", "G-G");
911 
912  TEST_REMAP2REFS("missing ali-pos (ref2-source)",
913  "-A", "--A",
914  "C", "C--", // in-seq missing 1 ali pos
915  "GG", "G-G");
916 
917  TEST_REMAP2REFS("missing ali-pos (ref1-target)",
918  "C-", "C", // out-seq missing 2 ali pos
919  "-A", "--A",
920  "GG", "G-G");
921  TEST_REMAP2REFS("missing ali-pos (ref2-target)",
922  "-A", "--A",
923  "C-", "C", // out-seq missing 2 ali pos
924  "GG", "G-G");
925 
926  TEST_REMAP2REFS("missing ali-pos (seq-source)",
927  "C--", "C--",
928  "-A-", "--A",
929  "GG", "G-G"); // in-seq missing 1 ali pos
930 
931  // --------------------
932  // test (too) small gaps
933 
934  TEST_REMAP1REF("gap gets too small (1)",
935  "A---T---A", "A--T--A",
936  "AGGGT---A", "AGGGT-A");
937 
938  TEST_REMAP1REF("gap gets too small (2)",
939  "A---T---A", "A--T--A",
940  "AGGGT--GA", "AGGGTGA");
941 
942  TEST_REMAP1REF("gap gets too small (3)",
943  "A---T---A", "A--T--A",
944  "AGGGT-G-A", "AGGGTGA");
945 
946  TEST_REMAP1REF("gap gets too small (4)",
947  "A---T---A", "A--T--A",
948  "AGGGTG--A", "AGGGTGA");
949 
950  TEST_REMAP1REF("gap tightens to fit (1)",
951  "A---T---A", "A--T--A",
952  "AGG-T---A", "AGGT--A");
953 
954  TEST_REMAP1REF("gap tightens to fit (2)",
955  "A---T---A", "A--T--A",
956  "AGC-T---A", "AGCT--A");
957  TEST_REMAP1REF("gap tightens to fit (2)",
958  "A---T---A", "A--T--A",
959  "A-CGT---A", "ACGT--A");
960 
961  TEST_REMAP1REF("gap with softmapping conflict (1)",
962  "A-------A", "A----A",
963  "A-CGT---A", "ACGT-A");
964  TEST_REMAP1REF("gap with softmapping conflict (2)",
965  "A-------A", "A----A",
966  "A--CGT--A", "ACGT-A");
967  TEST_REMAP1REF("gap with softmapping conflict (3)",
968  "A-------A", "A----A",
969  "A---CGT-A", "A-CGTA");
970  TEST_REMAP1REF("gap with softmapping conflict (4)",
971  "A-------A", "A----A",
972  "A----CGTA", "A-CGTA");
973  TEST_REMAP1REF("gap with softmapping conflict (5)",
974  "A-------A", "A----A",
975  "AC----GTA", "AC-GTA");
976 
977  TEST_REMAP1REF("drop missing bases to avoid misalignment (1)",
978  "A---T", "A--T",
979  "AG.GT", "AGGT");
980  TEST_REMAP1REF("drop missing bases to avoid misalignment (2)",
981  "A----T", "A--T",
982  "AG..GT", "AGGT");
983  TEST_REMAP1REF("dont drop missing bases if fixed map",
984  "A-C-T", "A-CT",
985  "AG.GT", "AG.GT");
986  TEST_REMAP1REF("dont drop gaps if fixed map",
987  "A-C-T", "A-CT",
988  "AG-GT", "AG-GT");
989 
990  // --------------------
991 
992  TEST_REMAP1REF("append rest of seq (1)",
993  "AT" , "A--T",
994  "AGG---T...A...", "A--GGTA");
995  TEST_REMAP1REF("append rest of seq (2)",
996  "AT." , "A--T--.",
997  "AGG---T...A...", "A--GGTA");
998  TEST_REMAP1REF("append rest of seq (3)",
999  "AT--" , "A--T---",
1000  "AGG---T...A...", "A--GGTA");
1001  TEST_REMAP1REF("append rest of seq (4)",
1002  "ATC" , "A--T--C",
1003  "AGG---T...A...", "A--G--GTA");
1004  TEST_REMAP1REF("append rest of seq (4)",
1005  "AT-C" , "A--T--C",
1006  "AGG---T...A...", "A--GG--TA");
1007 
1008  // --------------------
1009 
1010  TEST_REMAP2REFS("impossible references",
1011  "AC-TA", "A---CT--A", // impossible references
1012  "A-GTA", "AG---T--A",
1013  "TGCAT", "T---GCA-T"); // solve by insertion (partial misalignment)
1014 
1015  TEST_REMAP2REFS("impossible references",
1016  "AC-TA", "A--C-T--A", // impossible references
1017  "A-GTA", "AG---T--A",
1018  "TGCAT", "T--GCA--T"); // solve by insertion (partial misalignment)
1019 }
1020 
1021 #endif // UNIT_TESTS
const char * GB_ERROR
Definition: arb_core.h:25
string result
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1385
softbase(char base_, int origin_, char last_gapchar_)
void GBS_intcat(GBS_strstruct *strstr, long val)
Definition: arb_strbuf.cxx:127
void GBT_get_alignment_names(ConstStrArray &names, GBDATA *gbd)
Definition: adali.cxx:316
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
char * GB_read_as_string(GBDATA *gbd)
Definition: arbdb.cxx:1054
const int NO_POSITION
MG_remaps(GBDATA *gb_left, GBDATA *gb_right, bool enable, const char *reference_species_names)
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
void warning(int warning_num, const char *warning_message)
Definition: util.cxx:61
static char * alignment_name
CONSTEXPR_INLINE int calc_signed_digits(NUM val)
Definition: arbtools.h:207
static HelixNrInfo * start
GBDATA * GBT_find_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:177
GB_ERROR MG_adaptAllCopiedAlignments(const MG_remaps &remaps, GBDATA *source_species, GBDATA *destination_species)
GBS_strstruct * GBS_stropen(long init_size)
Definition: arb_strbuf.cxx:39
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
fflush(stdout)
GB_TYPES GB_read_type(GBDATA *gbd)
Definition: arbdb.cxx:1641
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
static void drop_dots(softbaselist &softbases, int excessive_positions)
softbaselist::const_iterator const_softbaseiter
void GBS_chrncat(GBS_strstruct *strstr, char ch, size_t n)
Definition: arb_strbuf.cxx:123
#define mg_assert(bed)
Definition: merge.hxx:24
long GBS_memoffset(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:91
void GBS_strcat(GBS_strstruct *strstr, const char *ptr)
Definition: arb_strbuf.cxx:108
static void error(const char *msg)
Definition: mkptypes.cxx:96
fputc('\n', stderr)
const char * add_reference(const char *in_reference, const char *out_reference)
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
const MG_remap & remap(int idx) const
const char * alignment_name(int idx) const
char * remap(const char *sequence) const
const int LEFT_BORDER
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:682
static GB_ERROR adaptCopiedAlignment(const MG_remap &remap, GBDATA *source_species, GBDATA *destination_species, const char *alignment_name)
void GBS_chrcat(GBS_strstruct *strstr, char ch)
Definition: arb_strbuf.cxx:119
static void copy(double **i, double **j)
Definition: trnsprob.cxx:32
const int RIGHT_BORDER
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
int size() const
char * GBS_mempntr(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:86
static MG_remap * MG_create_remap(GBDATA *gb_left, GBDATA *gb_right, const char *reference_species_names, const char *alignment_name)
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:903
char * GBS_strclose(GBS_strstruct *strstr)
Definition: arb_strbuf.cxx:69
uint32_t GBS_checksum(const char *seq, int ignore_case, const char *exclude)
Definition: adstring.cxx:353
void aw_message(const char *msg)
Definition: AW_status.cxx:932
bool is_std_gap(const char c)
#define NULp
Definition: cxxforward.h:97
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
GB_TYPES
Definition: arbdb.h:62
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
softbaselist::iterator softbaseiter
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:441
#define min(a, b)
Definition: f2c.h:153
static int info[maxsites+1]
bool doesRemap() const
char last_gapchar
std::list< softbase > softbaselist
#define max(a, b)
Definition: f2c.h:154