ARB
fast_aligner.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : fast_aligner.cxx //
4 // Purpose : A fast aligner (not a multiple aligner!) //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in 1998 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 
13 #include "fast_aligner.hxx"
14 #include "ClustalV.hxx"
15 #include "seq_search.hxx"
16 
17 #include <island_hopping.h>
18 
19 #include <awtc_next_neighbours.hxx>
20 #include <awt_sel_boxes.hxx>
21 
22 #include <aw_awars.hxx>
23 #include <aw_root.hxx>
24 #include <aw_question.hxx>
25 
26 #include <arbdbt.h>
27 #include <ad_cb.h>
28 
29 #include <arb_defs.h>
30 #include <arb_progress.h>
31 #include <RangeList.h>
32 
33 #include <cctype>
34 #include <cmath>
35 #include <climits>
36 
37 #include <list>
38 #include <awt_config_manager.hxx>
39 #include <rootAsWin.h>
40 
41 // --------------------------------------------------------------------------------
42 
43 #if defined(DEBUG)
44 // #define TRACE_CLUSTAL_DATA
45 // #define TRACE_ISLANDHOPPER_DATA
46 // #define TRACE_COMPRESSED_ALIGNMENT
47 // #define TRACE_RELATIVES
48 #endif // DEBUG
49 
50 // --------------------------------------------------------------------------------
51 
52 enum FA_report {
53  FA_NO_REPORT, // no report
54  FA_TEMP_REPORT, // report to temporary entries
55  FA_REPORT, // report to resident entries
56 };
57 
58 enum FA_range {
59  FA_WHOLE_SEQUENCE, // align whole sequence
60  FA_AROUND_CURSOR, // align xxx positions around current cursor position
61  FA_SELECTED_RANGE, // align selected range
62  FA_SAI_MULTI_RANGE, // align versus multi range define by SAI
63 };
64 
65 enum FA_turn {
66  FA_TURN_NEVER, // never try to turn sequence
67  FA_TURN_INTERACTIVE, // try to turn, but query user
68  FA_TURN_ALWAYS, // turn if score is better
69 };
70 
72  FA_REF_EXPLICIT, // reference sequence explicitly specified
73  FA_REF_CONSENSUS, // use group consensus as reference
74  FA_REF_RELATIVES, // search next relatives by PT server
75 };
76 
78  FA_CURRENT, // align current species
79  FA_MARKED, // align all marked species
80  FA_SELECTED, // align selected species (= range)
81 };
82 
84  FA_NO_ACTION, // do nothing
85  FA_MARK_FAILED, // mark failed species (unmark rest)
86  FA_MARK_ALIGNED, // mark aligned species (unmark rest)
87 };
88 
89 struct AlignParams {
91  bool showGapsMessages; // display messages about missing gaps in master?
92  PosRange range; // range to be aligned
93 };
94 
97  char *pt_server_alignment; // alignment used in pt_server (may differ from 'alignment')
98  int maxRelatives; // max # of relatives to use
99 
100  SearchRelativeParams(FamilyFinder *ff_, const char *pt_server_alignment_, int maxRelatives_)
101  : ff(ff_),
102  pt_server_alignment(strdup(pt_server_alignment_)),
103  maxRelatives(maxRelatives_)
104  {}
105 
107  free(pt_server_alignment);
108  delete(ff);
109  }
110 
112 };
113 
114 // --------------------------------------------------------------------------------
115 
116 #define GAP_CHAR '-'
117 #define QUALITY_NAME "ASC_ALIGNER_CLIENT_SCORE"
118 #define INSERTS_NAME "AMI_ALIGNER_MASTER_INSERTS"
119 
120 #define FA_AWAR_ROOT "faligner/"
121 #define FA_AWAR_TO_ALIGN FA_AWAR_ROOT "what"
122 #define FA_AWAR_REFERENCE FA_AWAR_ROOT "against"
123 #define FA_AWAR_REFERENCE_NAME FA_AWAR_ROOT "sagainst"
124 #define FA_AWAR_RANGE FA_AWAR_ROOT "range"
125 #define FA_AWAR_PROTECTION FA_AWAR_ROOT "protection"
126 #define FA_AWAR_AROUND FA_AWAR_ROOT "around"
127 #define FA_AWAR_MIRROR FA_AWAR_ROOT "mirror"
128 #define FA_AWAR_REPORT FA_AWAR_ROOT "report"
129 #define FA_AWAR_SHOW_GAPS_MESSAGES FA_AWAR_ROOT "show_gaps"
130 #define FA_AWAR_CONTINUE_ON_ERROR FA_AWAR_ROOT "continue_on_error"
131 #define FA_AWAR_ACTION_ON_ERROR FA_AWAR_ROOT "action_on_error"
132 #define FA_AWAR_USE_SECONDARY FA_AWAR_ROOT "use_secondary"
133 #define FA_AWAR_NEXT_RELATIVES FA_AWAR_ROOT "next_relatives"
134 #define FA_AWAR_RELATIVE_RANGE FA_AWAR_ROOT "relrange"
135 #define FA_AWAR_PT_SERVER_ALIGNMENT "tmp/" FA_AWAR_ROOT "relative_ali"
136 #define FA_AWAR_SAI_RANGE_NAME FA_AWAR_ROOT "sai/sainame"
137 #define FA_AWAR_SAI_RANGE_CHARS FA_AWAR_ROOT "sai/chars"
138 
139 #define FA_AWAR_ISLAND_HOPPING_ROOT "island_hopping/"
140 #define FA_AWAR_USE_ISLAND_HOPPING FA_AWAR_ISLAND_HOPPING_ROOT "use"
141 #define FA_AWAR_ESTIMATE_BASE_FREQ FA_AWAR_ISLAND_HOPPING_ROOT "estimate_base_freq"
142 #define FA_AWAR_BASE_FREQ_A FA_AWAR_ISLAND_HOPPING_ROOT "base_freq_a"
143 #define FA_AWAR_BASE_FREQ_C FA_AWAR_ISLAND_HOPPING_ROOT "base_freq_c"
144 #define FA_AWAR_BASE_FREQ_G FA_AWAR_ISLAND_HOPPING_ROOT "base_freq_g"
145 #define FA_AWAR_BASE_FREQ_T FA_AWAR_ISLAND_HOPPING_ROOT "base_freq_t"
146 #define FA_AWAR_SUBST_PARA_AC FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_ac"
147 #define FA_AWAR_SUBST_PARA_AG FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_ag"
148 #define FA_AWAR_SUBST_PARA_AT FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_at"
149 #define FA_AWAR_SUBST_PARA_CG FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_cg"
150 #define FA_AWAR_SUBST_PARA_CT FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_ct"
151 #define FA_AWAR_SUBST_PARA_GT FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_gt"
152 #define FA_AWAR_EXPECTED_DISTANCE FA_AWAR_ISLAND_HOPPING_ROOT "expected_dist"
153 #define FA_AWAR_STRUCTURE_SUPPLEMENT FA_AWAR_ISLAND_HOPPING_ROOT "struct_suppl"
154 #define FA_AWAR_THRESHOLD FA_AWAR_ISLAND_HOPPING_ROOT "threshold"
155 #define FA_AWAR_GAP_A FA_AWAR_ISLAND_HOPPING_ROOT "gapa"
156 #define FA_AWAR_GAP_B FA_AWAR_ISLAND_HOPPING_ROOT "gapb"
157 #define FA_AWAR_GAP_C FA_AWAR_ISLAND_HOPPING_ROOT "gapc"
158 
159 // --------------------------------------------------------------------------------
160 
161 static IslandHopping *island_hopper = NULp; // NULp -> use fast aligner; else use island hopper
162 
163 static GB_alignment_type global_alignmentType = GB_AT_UNKNOWN; // type of actually aligned sequence
164 
165 static int currentSequenceNumber; // used for counter
167 
168 // --------------------------------------------------------------------------------
169 
170 inline ARB_ERROR species_not_found(GB_CSTR species_name) {
171  return GBS_global_string("No species '%s' found!", species_name);
172 }
173 
174 static ARB_ERROR reverseComplement(GBDATA *gb_species, GB_CSTR ali, int max_protection) {
175  GBDATA *gbd = GBT_find_sequence(gb_species, ali);
176  ARB_ERROR error = NULp;
177 
178  if (!gbd) {
179  error = GBS_global_string("No 'data' found for species '%s'", GBT_get_name_or_description(gb_species));
180  }
181  else {
182  int my_protection = GB_read_security_write(gbd);
183 
184  if (my_protection<=max_protection) { // ok
185  char *seq = GB_read_string(gbd);
186  int length = GB_read_string_count(gbd);
187  GBDATA *gb_main = GB_get_root(gb_species);
188  GB_alignment_type ali_type = GBT_get_alignment_type(gb_main, ali);
189 
190  char T_or_U;
191  error = GBT_determine_T_or_U(ali_type, &T_or_U, "reverse-complement");
192  if (!error) {
193  GBT_reverseComplementNucSequence(seq, length, T_or_U);
194  error = GB_write_string(gbd, seq);
195  }
196  }
197  else { // protection error
198  error = GBS_global_string("Cannot reverse-complement species '%s' because of protection level", GBT_get_name_or_description(gb_species));
199  }
200 
201  }
202 
203  return error;
204 }
205 
206 static void build_reverse_complement(AW_window *aw, const AlignDataAccess *data_access) {
207  GBDATA *gb_main = data_access->gb_main;
208 
209  GB_push_transaction(gb_main);
210 
211  AW_root *root = aw->get_root();
212  FA_alignTarget revComplWhat = static_cast<FA_alignTarget>(root->awar(FA_AWAR_TO_ALIGN)->read_int());
213  char *default_alignment = GBT_get_default_alignment(gb_main);
214  GB_CSTR alignment = root->awar_string(AWAR_EDITOR_ALIGNMENT, default_alignment)->read_string();
215  ARB_ERROR error = NULp;
216  int max_protection = root->awar(FA_AWAR_PROTECTION)->read_int();
217 
218  switch (revComplWhat) {
219  case FA_CURRENT: { // current species
220  GB_CSTR species_name = root->awar(AWAR_SPECIES_NAME)->read_string();
221  GBDATA *gb_species = GBT_find_species(gb_main, species_name);
222  if (!gb_species) error = species_not_found(species_name);
223  if (!error) error = reverseComplement(gb_species, alignment, max_protection);
224  break;
225  }
226  case FA_MARKED: { // marked species
227  GBDATA *gb_species = GBT_first_marked_species(gb_main);
228 
229  if (!gb_species) {
230  error = "There is no marked species";
231  }
232  while (gb_species) {
233  error = reverseComplement(gb_species, alignment, max_protection);
234  if (error) break;
235  gb_species = GBT_next_marked_species(gb_species);
236  }
237  break;
238  }
239  case FA_SELECTED: { // selected species (editor selection!)
240 
243 
244  int count = 0;
245  GBDATA *gb_species = get_first_selected_species(&count);
246 
247  if (!gb_species) {
248  error = "There is no selected species!";
249  }
250  while (gb_species) {
251  error = reverseComplement(gb_species, alignment, max_protection);
252  if (error) break;
253  gb_species = get_next_selected_species();
254  }
255  break;
256  }
257  default: {
258  fa_assert(0);
259  break;
260  }
261  }
262 
263  GB_end_transaction_show_error(gb_main, error, aw_message);
264 }
265 
266 // --------------------------------------------------------------------------------
267 
268 class AliChange { // describes a local alignment change
269  const CompactedSubSequence& Old;
270  const CompactedSubSequence& New;
271 
272 public:
274  : Old(old_), New(new_)
275  {
277  }
278 
279  int follow(ExplicitRange& range) const {
280  ExplicitRange compressed(Old.compPosition(range.start()),
281  Old.compPosition(range.end()));
282 
283  int exp_start = New.expdPosition(compressed.start());
284  int exp_end = New.expdPosition(compressed.end());
285 
286  int gaps_before = New.no_of_gaps_before(compressed.start());
287  int gaps_after = New.no_of_gaps_after(compressed.end());
288 
289  fa_assert(gaps_before >= 0);
290  fa_assert(gaps_after >= 0);
291 
292  range = ExplicitRange(exp_start-gaps_before, exp_end+gaps_after);
293 
294  return compressed.size(); // number of bases
295  }
296 };
297 
298 class LooseBases {
299  typedef std::list<ExplicitRange> Ranges;
300 
301  Ranges ranges;
302 
303 public:
304 
305  bool is_empty() const { return ranges.empty(); }
306  void clear() { ranges.clear(); }
307 
308  void memorize(ExplicitRange range) {
309  ranges.push_front(range);
310  }
312  ExplicitRange range = ranges.front();
313  ranges.pop_front();
314  return range;
315  }
316  int follow_ali_change(const AliChange& change) {
317  // transform positions according to changed alignment (oldSequence -> newSequence)
318  // returns the number of bases contained in 'this'
319  int basecount = 0;
320  if (!is_empty()) {
321  for (Ranges::iterator r = ranges.begin(); r != ranges.end(); ++r) {
322  basecount += change.follow(*r);
323  }
324  }
325  return basecount;
326  }
327  void append(LooseBases& loose) { ranges.splice(ranges.end(), loose.ranges); }
328  int follow_ali_change_and_append(LooseBases& loose, const AliChange& change) {
329  // returns the number of loose bases (added from 'loose')
330  int basecount = loose.follow_ali_change(change);
331  append(loose);
332  return basecount;
333  }
334 };
335 
336 static LooseBases unaligned_bases; // if fast_align cannot align (no master bases) -> stores positions here
337 
338 
339 static const char *read_name(GBDATA *gbd) {
340  if (!gbd) return "GROUP-CONSENSUS";
341  const char *name = GBT_get_name(gbd);
342  return name ? name : "<unnamed-species>";
343 }
344 
345 inline int relatedBases(char base1, char base2) {
346  return baseMatch(base1, base2)==1;
347 }
348 
349 inline char alignQuality(char slave, char master) {
350  fa_assert(slave);
351  fa_assert(master);
352 
353  char result = '#';
354  if (slave==master) result = '-'; // equal
355  else if (slave==GAP_CHAR) result = '+'; // inserted gap
356  else if (master==GAP_CHAR) result = '+'; // no gap in master
357  else if (relatedBases(slave, master)) result = '~'; // mutation (related bases)
358 
359  return result; // mutation (non-related bases)
360 }
361 
362 // -------------------------
363 // Debugging stuff
364 
365 #ifdef DEBUG
366 static char *lstr(const char *s, int len) {
367  static int alloc;
368  static char *buffer;
369 
370  if (alloc<(len+1)) {
371  if (alloc) free(buffer);
372  ARB_alloc(buffer, alloc=len+100);
373  }
374 
375  memcpy(buffer, s, len);
376  buffer[len] = 0;
377 
378  return buffer;
379 }
380 
381 #define BUFLEN 120
382 
383 inline char compareChar(char base1, char base2) {
384  return base1==base2 ? '=' : (relatedBases(base1, base2) ? 'x' : 'X');
385 }
386 
387 #if defined(TRACE_COMPRESSED_ALIGNMENT)
388 
389 static void dump_n_compare_one(const char *seq1, const char *seq2, long len, long offset) {
390  fa_assert(len<=BUFLEN);
391  char compare[BUFLEN+1];
392 
393  for (long l=0; l<len; l++) {
394  compare[l] = (is_ali_gap(seq1[l]) && is_ali_gap(seq2[l])) ? ' ' : compareChar(seq1[l], seq2[l]);
395  }
396 
397  compare[len] = 0;
398 
399  printf(" %li '%s'\n", offset, lstr(seq1, len));
400  printf(" %li '%s'\n", offset, lstr(seq2, len));
401  printf(" %li '%s'\n", offset, compare);
402 }
403 
404 inline void dump_rest(const char *seq, long len, int idx, long offset) {
405  printf(" Rest von Sequenz %i:\n", idx);
406  while (len>BUFLEN) {
407  printf(" %li '%s'\n", offset, lstr(seq, BUFLEN));
408  seq += BUFLEN;
409  len -= BUFLEN;
410  offset += BUFLEN;
411  }
412 
413  fa_assert(len>0);
414  printf(" '%s'\n", lstr(seq, len));
415 }
416 
417 static void dump_n_compare(const char *text, const char *seq1, long len1, const char *seq2, long len2) {
418  long offset = 0;
419 
420  printf(" Comparing %s:\n", text);
421 
422  while (len1>0 && len2>0) {
423  long done = 0;
424 
425  if (len1>=BUFLEN && len2>=BUFLEN) {
426  dump_n_compare_one(seq1, seq2, done=BUFLEN, offset);
427  }
428  else {
429  long min = len1<len2 ? len1 : len2;
430  dump_n_compare_one(seq1, seq2, done=min, offset);
431  }
432 
433  seq1 += done;
434  seq2 += done;
435  len1 -= done;
436  len2 -= done;
437  offset += done;
438  }
439 
440  if (len1>0) dump_rest(seq1, len1, 1, offset);
441  if (len2>0) dump_rest(seq2, len2, 2, offset);
442  printf(" -------------------\n");
443 }
444 
445 static void dump_n_compare(const char *text, const CompactedSubSequence& seq1, const CompactedSubSequence& seq2) {
446  dump_n_compare(text, seq1.text(), seq1.length(), seq2.text(), seq2.length());
447 }
448 #endif // TRACE_COMPRESSED_ALIGNMENT
449 
450 #undef BUFLEN
451 
452 inline void dumpSeq(const char *seq, long len, long pos) {
453  printf("'%s' ", lstr(seq, len));
454  printf("(Pos=%li,Len=%li)", pos, len);
455 }
456 
457 #define dump() \
458  do { \
459  double sig = partSignificance(sequence().length(), slaveSequence.length(), bestLength); \
460  \
461  printf(" Score = %li (Significance=%f)\n" \
462  " Master = ", bestScore, sig); \
463  dumpSeq(bestMasterLeft.text(), bestLength, bestMasterLeft.leftOf()); \
464  printf("\n" \
465  " Slave = "); \
466  dumpSeq(bestSlaveLeft.text(), bestLength, bestSlaveLeft.leftOf()); \
467  printf("\n"); \
468  } while (0)
469 
470 #endif //DEBUG
471 
472 
473 // ------------------------------------------------
474 // INLINE-functions used in fast_align():
475 
476 inline double log3(double d) {
477  return log(d)/log(3.0);
478 }
479 inline double partSignificance(long seq1len, long seq2len, long partlen) {
480  // returns log3 of significance of the part
481  // usage: partSignificance(...) < log3(maxAllowedSignificance)
482  return log3((seq1len-partlen)*(seq2len-partlen)) - partlen;
483 }
484 
486  return "Cannot align - reserved buffer is to small";
487 }
488 
489 inline long insertsToNextBase(AlignBuffer *alignBuffer, const SequencePosition& master) {
490  int inserts;
491  int nextBase;
492 
493  if (master.rightOf()>0) {
494  nextBase = master.expdPosition();
495  }
496  else {
497  nextBase = master.sequence().expdPosition(master.sequence().length());
498  }
499  inserts = nextBase-alignBuffer->offset();
500 
501  return inserts;
502 }
503 
504 inline void insertBase(AlignBuffer *alignBuffer,
505  SequencePosition& master, SequencePosition& slave,
506  FastAlignReport *report)
507 {
508  char slaveBase = *slave.text();
509  char masterBase = *master.text();
510 
511  alignBuffer->set(slaveBase, alignQuality(slaveBase, masterBase));
512  report->count_aligned_base(slaveBase!=masterBase);
513  ++slave;
514  ++master;
515 }
516 
517 inline void insertSlaveBases(AlignBuffer *alignBuffer,
518  SequencePosition& slave,
519  int length,
520  FastAlignReport *report)
521 {
522  alignBuffer->copy(slave.text(), alignQuality(*slave.text(), GAP_CHAR), length);
523  report->count_unaligned_base(length);
524  slave += length;
525 }
526 
527 inline void insertGap(AlignBuffer *alignBuffer,
528  SequencePosition& master,
529  FastAlignReport *report)
530 {
531  char masterBase = *master.text();
532 
533  alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, masterBase));
534  report->count_aligned_base(masterBase!=GAP_CHAR);
535  ++master;
536 }
537 
539  SequencePosition& master,
540  SequencePosition& slave,
541  const char *masterAlignment, const char *slaveAlignment, long alignmentLength,
542  FastAlignReport *report)
543 {
544  // inserts bases of 'slave' to 'alignBuffer' according to alignment in 'masterAlignment' and 'slaveAlignment'
545 #define ACID '*' // contents of 'masterAlignment' and 'slaveAlignment'
546 #define GAP '-'
547 
548  int pos;
549  int baseAtLeft = 0; // TRUE -> last position in alignBuffer contains a base
550 
551  for (pos=0; pos<alignmentLength; pos++) {
552  long insert = insertsToNextBase(alignBuffer, master); // in expanded seq
553 
554  if (masterAlignment[pos]==ACID) {
555  if (insert>0) {
556  if (insert>alignBuffer->free()) return bufferTooSmall();
557  alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), insert);
558  fa_assert(insertsToNextBase(alignBuffer, master)==0);
559  insert = 0;
560  }
561 
562  if (!alignBuffer->free()) return bufferTooSmall();
563  if (slaveAlignment[pos]==ACID) {
564  insertBase(alignBuffer, master, slave, report);
565  baseAtLeft = 1;
566  }
567  else {
568  insertGap(alignBuffer, master, report);
569  baseAtLeft = 0;
570  }
571  }
572  else {
573  int slave_bases;
574 
575  fa_assert(masterAlignment[pos]==GAP);
576  for (slave_bases=1; pos+slave_bases<alignmentLength && masterAlignment[pos+slave_bases]==GAP; slave_bases++) {
577  ; // count gaps in master (= # of slave bases to insert)
578  }
579  if (!baseAtLeft && insert>slave_bases) {
580  int ins_gaps = insert-slave_bases;
581 
582  fa_assert(alignBuffer->free()>=ins_gaps);
583  alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), ins_gaps);
584  insert -= ins_gaps;
585  }
586 
587  if (insert<slave_bases) { // master has less gaps than slave bases to insert
588  report->memorize_insertion(master.expdPosition(), slave_bases-insert);
589  }
590  else if (insert>slave_bases) { // master has more gaps than slave bases to insert
591  fa_assert(baseAtLeft);
592  }
593 
594  unaligned_bases.memorize(ExplicitRange(slave.expdPosition(), // memorize base positions without counterpart in master
595  slave.expdPosition(slave_bases-1)));
596 
597  if (slave_bases>alignBuffer->free()) return bufferTooSmall();
598  insertSlaveBases(alignBuffer, slave, slave_bases, report);
599  pos += slave_bases-1; // -1 compensates for()-increment
600  baseAtLeft = 1;
601  }
602  }
603 
604  return NULp;
605 
606 #undef GAP
607 #undef ACID
608 }
609 
610 static ARB_ERROR insertAligned(AlignBuffer *alignBuffer,
611  SequencePosition& master, SequencePosition& slave, long partLength,
612  FastAlignReport *report)
613 {
614  // insert bases of 'slave' to 'alignBuffer' according to 'master'
615  if (partLength) {
616  long insert = insertsToNextBase(alignBuffer, master);
617 
618  fa_assert(partLength>=0);
619 
620  if (insert<0) { // insert gaps into master
621  long min_insert = insert;
622 
623  report->memorize_insertion(master.expdPosition(), -insert);
624 
625  while (insert<0 && partLength) {
626  if (insert<min_insert) min_insert = insert;
627  if (!alignBuffer->free()) {
628  return bufferTooSmall();
629  }
630  insertBase(alignBuffer, master, slave, report);
631  partLength--;
632  insert = insertsToNextBase(alignBuffer, master);
633  }
634 
635  fa_assert(partLength>=0);
636  if (partLength==0) { // all inserted
637  return NULp;
638  }
639  }
640 
641  fa_assert(insert>=0);
642 
643  if (insert>0) { // insert gaps into slave
644  if (insert>alignBuffer->free()) return bufferTooSmall();
645  alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), insert);
646  fa_assert(insertsToNextBase(alignBuffer, master)==0);
647  }
648 
649  fa_assert(partLength>=0);
650 
651  while (partLength--) {
652  insert = insertsToNextBase(alignBuffer, master);
653 
654  fa_assert(insert>=0);
655  if (insert>0) {
656  if (insert>=alignBuffer->free()) return bufferTooSmall();
657  alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), insert);
658  }
659  else {
660  if (!alignBuffer->free()) {
661  return bufferTooSmall();
662  }
663  }
664 
665  insertBase(alignBuffer, master, slave, report);
666  }
667  }
668 
669  return NULp;
670 }
671 
672 static ARB_ERROR cannot_fast_align(const CompactedSubSequence& master, long moffset, long mlength,
673  const CompactedSubSequence& slaveSequence, long soffset, long slength,
674  int max_seq_length,
675  AlignBuffer *alignBuffer,
676  FastAlignReport *report)
677 {
678  const char *mtext = master.text(moffset);
679  const char *stext = slaveSequence.text(soffset);
680  ARB_ERROR error = NULp;
681 
682  if (slength) {
683  if (mlength) { // if slave- and master-sequences contain bases, we call the slow aligner
684 #ifdef TRACE_CLUSTAL_DATA
685  printf("ClustalV-Align:\n");
686  printf(" mseq = '%s'\n", lstr(mtext, mlength));
687  printf(" sseq = '%s'\n", lstr(stext, slength));
688 #endif // TRACE_CLUSTAL_DATA
689 
690  int is_dna = -1;
691 
692  switch (global_alignmentType) {
693  case GB_AT_RNA:
694  case GB_AT_DNA: is_dna = 1; break;
695  case GB_AT_AA: is_dna = 0; break;
696  default: error = "Unknown alignment type - aligner aborted"; break;
697  }
698 
699  const char *maligned, *saligned;
700  int len;
701  if (!error) {
702  int score; // unused
703  error = ClustalV_align(is_dna,
704  1,
705  mtext, mlength, stext, slength,
706  master.gapsBefore(moffset),
707  max_seq_length,
708  maligned, saligned, len, score);
709  }
710 
711  if (!error) {
712 #ifdef TRACE_CLUSTAL_DATA
713  printf("ClustalV returns:\n");
714  printf(" maligned = '%s'\n", lstr(maligned, len));
715  printf(" saligned = '%s'\n", lstr(saligned, len));
716 #endif // TRACE_CLUSTAL_DATA
717 
718  SequencePosition masterPos(master, moffset);
719  SequencePosition slavePos(slaveSequence, soffset);
720 
721  error = insertClustalValigned(alignBuffer, masterPos, slavePos, maligned, saligned, len, report);
722 
723 #if (defined(DEBUG) && 0)
724 
725  SequencePosition master2(master->sequence(), moffset);
726  SequencePosition slave2(slaveSequence, soffset);
727  char *cmp = new char[len];
728 
729  for (int l=0; l<len; l++) {
730  int gaps = 0;
731 
732  if (maligned[l]=='*') {
733  maligned[l] = *master2.text();
734  ++master2;
735  }
736  else {
737  gaps++;
738  }
739 
740  if (saligned[l]=='*') {
741  saligned[l] = *slave2.text();
742  ++slave2;
743  }
744  else {
745  gaps++;
746  }
747 
748  cmp[l] = gaps || maligned[l]==saligned[l] ? '=' : 'X';
749  }
750 
751  printf(" master = '%s'\n", lstr(maligned, len));
752  printf(" slave = '%s'\n", lstr(saligned, len));
753  printf(" '%s'\n", lstr(cmp, len));
754 
755  delete [] cmp;
756 #endif
757  }
758  }
759  else { // master is empty here, so we just copy in the slave bases
760  if (slength<=alignBuffer->free()) {
761  unaligned_bases.memorize(ExplicitRange(slaveSequence.expdPosition(soffset),
762  slaveSequence.expdPosition(soffset+slength-1)));
763 
764  alignBuffer->copy(slaveSequence.text(soffset), '?', slength); // '?' means not aligned (just inserted)
765  // corrected by at alignBuffer->correctUnalignedPositions()
766  report->count_unaligned_base(slength);
767  }
768  else {
769  error = bufferTooSmall();
770  }
771  }
772  }
773 
774  return error;
775 }
776 
777 // ------------------------------------
778 // #define's for fast_align()
779 
780 #define TEST_BETTER_SCORE() \
781  do { \
782  if (score>bestScore) { \
783  bestScore = score; \
784  bestLength = masterRight.text() - masterLeft.text(); \
785  bestMasterLeft = masterLeft; \
786  bestSlaveLeft = slaveLeft; \
787  } \
788  } while (0)
789 
790 #define CAN_SCORE_LEFT() (masterLeft.leftOf() && slaveLeft.leftOf())
791 #define CAN_SCORE_RIGHT() (masterRight.rightOf() && slaveRight.rightOf())
792 
793 #define SCORE_LEFT() \
794  do { \
795  score += *(--masterLeft).text()==*(--slaveLeft).text() ? match : mismatch; \
796  TEST_BETTER_SCORE(); \
797  } while (0)
798 
799 #define SCORE_RIGHT() \
800  do { \
801  score += *(++masterRight).text()==*(++slaveRight).text() ? match : mismatch; \
802  TEST_BETTER_SCORE(); \
803  } while (0)
804 
805 
807  AlignBuffer *alignBuffer,
808  int max_seq_length,
809  int match, int mismatch,
810  FastAlignReport *report) const
811 {
812  // aligns 'slaveSequence' to 'this'
813  //
814  // returns
815  // NULp -> all ok -> 'alignBuffer' contains aligned sequence
816  // or error -> failure -> no results
817 
818  ARB_ERROR error = NULp;
819  int aligned = 0;
820 
821  // set the following #if to zero to test ClustalV-Aligner (not fast_aligner)
822 #if 1
823 
824  static double lowSignificance;
825  static int lowSignificanceInitialized;
826 
827  if (!lowSignificanceInitialized) {
828  lowSignificance = log3(0.01);
829  lowSignificanceInitialized = 1;
830  }
831 
832  SequencePosition slave(slaveSequence);
833  long bestScore=0;
834  SequencePosition bestMasterLeft(sequence());
835  SequencePosition bestSlaveLeft(slaveSequence);
836  long bestLength=0;
837 
838  while (slave.rightOf()>=3) { // with all triples of slaveSequence
839  FastSearchOccurrence occurrence(*this, slave.text()); // "search" first
840  SequencePosition rightmostSlave = slave;
841 
842  while (occurrence.found()) { // with all occurrences of the triple
843  long score = match*3;
844  SequencePosition masterLeft(occurrence.sequence(), occurrence.offset());
845  SequencePosition masterRight(occurrence.sequence(), occurrence.offset()+3);
846  SequencePosition slaveLeft(slave);
847  SequencePosition slaveRight(slave, 3);
848 
849  while (score>0) {
850  if (CAN_SCORE_LEFT()) {
851  SCORE_LEFT();
852  }
853  else {
854  while (score>0 && CAN_SCORE_RIGHT()) {
855  SCORE_RIGHT();
856  }
857  break;
858  }
859 
860  if (CAN_SCORE_RIGHT()) {
861  SCORE_RIGHT();
862  }
863  else {
864  while (score>0 && CAN_SCORE_LEFT()) {
865  SCORE_LEFT();
866  }
867  break;
868  }
869  }
870 
871  occurrence.gotoNext(); // "search" next
872 
873  if (rightmostSlave<slaveRight) {
874  rightmostSlave = slaveRight;
875  rightmostSlave -= 3;
876  }
877  }
878 
879  if (rightmostSlave>slave) slave = rightmostSlave;
880  else ++slave;
881  }
882 
883  if (bestLength) {
884  double sig = partSignificance(sequence().length(), slaveSequence.length(), bestLength);
885 
886  if (sig<lowSignificance) {
887  long masterLeftOf = bestMasterLeft.leftOf();
888  long masterRightStart = masterLeftOf+bestLength;
889  long masterRightOf = bestMasterLeft.rightOf()-bestLength;
890  long slaveLeftOf = bestSlaveLeft.leftOf();
891  long slaveRightStart = slaveLeftOf+bestLength;
892  long slaveRightOf = bestSlaveLeft.rightOf()-bestLength;
893 
894 #define MIN_ALIGNMENT_RANGE 4
895 
896  if (!error) {
897  if (masterLeftOf >= MIN_ALIGNMENT_RANGE && slaveLeftOf >= MIN_ALIGNMENT_RANGE) {
898  CompactedSubSequence leftCompactedMaster(sequence(), 0, masterLeftOf);
899  FastSearchSequence leftMaster(leftCompactedMaster);
900 
901  error = leftMaster.fast_align(CompactedSubSequence(slaveSequence, 0, slaveLeftOf),
902  alignBuffer, max_seq_length, match, mismatch, report);
903  }
904  else if (slaveLeftOf>0) {
905  error = cannot_fast_align(sequence(), 0, masterLeftOf,
906  slaveSequence, 0, slaveLeftOf,
907  max_seq_length, alignBuffer, report);
908  }
909 
910  aligned = 1;
911  }
912 
913  // align part of slave sequence according to master sequence:
914 
915  if (!error) {
916 #if (defined(DEBUG) && 0)
917  long offset = alignBuffer->offset();
918  long used;
919 #endif
920  error = insertAligned(alignBuffer, bestMasterLeft, bestSlaveLeft, bestLength, report);
921 #if (defined(DEBUG) && 0)
922  used = alignBuffer->offset()-offset;
923  printf("aligned '%s' (len=%li, address=%li)\n", lstr(alignBuffer->text()+offset, used), used, long(alignBuffer));
924 #endif
925  aligned = 1;
926  }
927 
928  if (!error) {
929  if (masterRightOf >= MIN_ALIGNMENT_RANGE && slaveRightOf >= MIN_ALIGNMENT_RANGE) {
930  CompactedSubSequence rightCompactedMaster(sequence(), masterRightStart, masterRightOf);
931  FastSearchSequence rightMaster(rightCompactedMaster);
932 
933  error = rightMaster.fast_align(CompactedSubSequence(slaveSequence, slaveRightStart, slaveRightOf),
934  alignBuffer,
935  max_seq_length, match, mismatch, report);
936  }
937  else if (slaveRightOf>0) {
938  error = cannot_fast_align(sequence(), masterRightStart, masterRightOf,
939  slaveSequence, slaveRightStart, slaveRightOf,
940  max_seq_length, alignBuffer, report);
941  }
942 
943  aligned = 1;
944  }
945 
946  }
947  }
948 
949 #endif
950 
951  if (!aligned && !error) {
952  error = cannot_fast_align(sequence(), 0, sequence().length(),
953  slaveSequence, 0, slaveSequence.length(),
954  max_seq_length, alignBuffer, report);
955  }
956 
957  return error;
958 }
959 
960 #undef dump
961 #undef TEST_BETTER_SCORE
962 #undef CAN_SCORE_LEFT
963 #undef CAN_SCORE_RIGHT
964 #undef SCORE_LEFT
965 #undef SCORE_RIGHT
966 
967 inline long calcSequenceChecksum(const char *data, long length) {
968  return GB_checksum(data, length, 1, GAP_CHARS);
969 }
970 
971 #if defined(WARN_TODO)
972 #warning firstColumn + lastColumn -> PosRange
973 #endif
974 
976  const char *ali,
977  ARB_ERROR *errorPtr,
978  char **dataPtr, // if dataPtr is specified, it will be set to the WHOLE uncompacted sequence
979  long *seqChksum, // may be NULp (of part of sequence)
980  PosRange range) // if !range.is_whole() -> return only part of the sequence
981 {
982  ARB_ERROR error = NULp;
983  GBDATA *gbd;
984  CompactedSubSequence *seq = NULp;
985 
986  gbd = GBT_find_sequence(gb_species, ali); // get sequence
987 
988  if (gbd) {
989  long length = GB_read_string_count(gbd);
990  char *data = GB_read_string(gbd);
991  long partLength;
992  char *partData;
993 
994  if (dataPtr) { // make a copy of the whole uncompacted sequence (returned to caller)
995  *dataPtr = data;
996  }
997 
998  int firstColumn = range.start();
999  if (range.is_part()) { // take only part of sequence
1000  int lastColumn = range.end();
1001 
1002  fa_assert(firstColumn>=0);
1003  fa_assert(firstColumn<=lastColumn);
1004 
1005  // include all surrounding gaps (@@@ this might cause problems with partial alignment)
1006  while (firstColumn>0 && is_ali_gap(data[firstColumn-1])) {
1007  firstColumn--;
1008  }
1009  if (lastColumn!=-1) {
1010  while (lastColumn<(length-1) && is_ali_gap(data[lastColumn+1])) lastColumn++;
1011  fa_assert(lastColumn<length);
1012  }
1013 
1014  partData = data+firstColumn;
1015  int slen = length-firstColumn;
1016 
1017  fa_assert(slen>=0);
1018  fa_assert((size_t)slen==strlen(partData));
1019 
1020  if (lastColumn==-1) { // take all till end of sequence
1021  partLength = slen;
1022  }
1023  else {
1024  partLength = lastColumn-firstColumn+1;
1025  if (partLength>slen) partLength = slen; // cut rest, if we have any
1026  }
1027  }
1028  else {
1029  partLength = length;
1030  partData = data;
1031  }
1032 
1033  if (!error) {
1034  if (seqChksum) {
1035  *seqChksum = calcSequenceChecksum(partData, partLength);
1036  }
1037 
1038  seq = new CompactedSubSequence(partData, partLength, GBT_get_name_or_description(gb_species), firstColumn);
1039  }
1040 
1041  if (!dataPtr) { // free sequence only if user has not requested to get it
1042  free(data);
1043  }
1044  }
1045  else {
1046  error = GBS_global_string("No 'data' found for species '%s'", GBT_get_name_or_description(gb_species));
1047  if (dataPtr) *dataPtr = NULp; // (user must not care to free data if we fail)
1048  }
1049 
1050  fa_assert(errorPtr);
1051  *errorPtr = error;
1052 
1053  return seq;
1054 }
1055 
1056 static ARB_ERROR writeStringToAlignment(GBDATA *gb_species, GB_CSTR alignment, GB_CSTR data_name, GB_CSTR str, bool temporary) {
1057  GBDATA *gb_ali = GB_search(gb_species, alignment, GB_DB);
1058  ARB_ERROR error = NULp;
1059  GBDATA *gb_name = GB_search(gb_ali, data_name, GB_STRING);
1060 
1061  if (gb_name) {
1062  fa_assert(GB_is_ancestor_of(gb_ali, gb_name));
1063  error = GB_write_string(gb_name, str);
1064  if (temporary && !error) error = GB_set_temporary(gb_name);
1065  }
1066  else {
1067  error = GBS_global_string("Cannot create entry '%s' for '%s'", data_name, GBT_get_name_or_description(gb_species));
1068  }
1069 
1070  return error;
1071 }
1072 
1073 // --------------------------------------------------------------------------------
1074 
1076  const FastSearchSequence *alignTo,
1077  int max_seq_length,
1078  GB_CSTR alignment,
1079  long toAlignChksum,
1080  GBDATA *gb_toAlign,
1081  GBDATA *gb_alignTo, // may be NULp
1082  const AlignParams& ali_params)
1083 {
1084  // if only part of the sequence should be aligned, then this functions already gets only the part
1085  // (i.o.w.: toAlignSequence, alignTo and toAlignChksum refer to the partial sequence)
1086  AlignBuffer alignBuffer(max_seq_length);
1087  if (ali_params.range.start()>0) {
1088  alignBuffer.set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), ali_params.range.start());
1089  }
1090 
1091  const char *master_name = read_name(gb_alignTo);
1092 
1093  FastAlignReport report(master_name, ali_params.showGapsMessages);
1094 
1095  {
1096  static GBDATA *last_gb_toAlign = NULp;
1097  if (gb_toAlign!=last_gb_toAlign) {
1098  last_gb_toAlign = gb_toAlign;
1099  currentSequenceNumber++;
1100  }
1101  }
1102 
1103 #ifdef TRACE_COMPRESSED_ALIGNMENT
1104  printf("alignCompactedTo(): master='%s' ", master_name);
1105  printf("slave='%s'\n", toAlignSequence->name());
1106 #endif // TRACE_COMPRESSED_ALIGNMENT
1107 
1108  ARB_ERROR error = GB_pop_transaction(gb_toAlign);
1109  if (!error) {
1110  if (island_hopper) {
1111  error = island_hopper->do_align();
1112  if (!error) {
1113  fa_assert(island_hopper->was_aligned());
1114 
1115 #ifdef TRACE_ISLANDHOPPER_DATA
1116  printf("Island-Hopper returns:\n");
1117  printf("maligned = '%s'\n", lstr(island_hopper->get_result_ref(), island_hopper->get_result_length()));
1118  printf("saligned = '%s'\n", lstr(island_hopper->get_result(), island_hopper->get_result_length()));
1119 #endif // TRACE_ISLANDHOPPER_DATA
1120 
1121  SequencePosition masterPos(alignTo->sequence(), 0);
1122  SequencePosition slavePos(*toAlignSequence, 0);
1123 
1124  error = insertClustalValigned(&alignBuffer, masterPos, slavePos, island_hopper->get_result_ref(), island_hopper->get_result(), island_hopper->get_result_length(), &report);
1125  }
1126  }
1127  else {
1128  error = alignTo->fast_align(*toAlignSequence, &alignBuffer, max_seq_length, 2, -10, &report); // <- here we align
1129  }
1130  }
1131 
1132  if (!error) {
1133  alignBuffer.correctUnalignedPositions();
1134  if (alignBuffer.free()) {
1135  alignBuffer.set('-', alignQuality(GAP_CHAR, GAP_CHAR), alignBuffer.free()); // rest of alignBuffer is set to '-'
1136  }
1137  alignBuffer.restoreDots(*toAlignSequence);
1138  }
1139 
1140 #ifdef TRACE_COMPRESSED_ALIGNMENT
1141  if (!error) {
1142  CompactedSubSequence alignedSlave(alignBuffer.text(), alignBuffer.length(), toAlignSequence->name());
1143  dump_n_compare("reference vs. aligned:", alignTo->sequence(), alignedSlave);
1144  }
1145 #endif // TRACE_COMPRESSED_ALIGNMENT
1146 
1147  {
1148  GB_ERROR err = GB_push_transaction(gb_toAlign);
1149  if (!error) error = err;
1150  }
1151 
1152  if (!error) {
1153  if (calcSequenceChecksum(alignBuffer.text(), alignBuffer.length())!=toAlignChksum) { // sequence-chksum changed
1154  error = "Internal aligner error (sequence checksum changed) -- aborted";
1155 
1156 #ifdef TRACE_COMPRESSED_ALIGNMENT
1157  CompactedSubSequence alignedSlave(alignBuffer.text(), alignBuffer.length(), toAlignSequence->name());
1158  dump_n_compare("Old Slave vs. new Slave", *toAlignSequence, alignedSlave);
1159 #endif // TRACE_COMPRESSED_ALIGNMENT
1160  }
1161  else {
1162  {
1163  GB_topSecurityLevel unsecured(gb_toAlign);
1164  GBDATA *gbd = GBT_add_data(gb_toAlign, alignment, "data", GB_STRING);
1165 
1166  if (!gbd) {
1167  error = "Can't find/create sequence data";
1168  }
1169  else {
1170  if (ali_params.range.is_part()) { // we aligned just a part of the sequence
1171  char *buffer = GB_read_string(gbd); // so we read old sequence data
1172  long len = GB_read_string_count(gbd);
1173  if (!buffer) error = GB_await_error();
1174  else {
1175  int lenToCopy = ali_params.range.size();
1176  long wholeChksum = calcSequenceChecksum(buffer, len);
1177 
1178  memcpy(buffer+ali_params.range.start(), alignBuffer.text()+ali_params.range.start(), lenToCopy); // copy in the aligned part
1179  // @@@ genau um 1 position zu niedrig
1180 
1181  if (calcSequenceChecksum(buffer, len) != wholeChksum) {
1182  error = "Internal aligner error (sequence checksum changed) -- aborted";
1183 # ifdef TRACE_COMPRESSED_ALIGNMENT
1184  char *buffer_org = GB_read_string(gbd);
1185  dump_n_compare("Old seq vs. new seq (slave)", buffer_org, len, buffer, len);
1186  free(buffer_org);
1187 # endif // TRACE_COMPRESSED_ALIGNMENT
1188  }
1189  else {
1190  GB_write_string(gbd, "");
1191  error = GB_write_string(gbd, buffer);
1192  }
1193  }
1194 
1195  free(buffer);
1196  }
1197  else {
1198  alignBuffer.setDotsAtEOSequence();
1199  error = GB_write_string(gbd, alignBuffer.text()); // aligned all -> write all
1200  }
1201  }
1202  }
1203 
1204  if (!error && ali_params.report != FA_NO_REPORT) {
1205  // create temp-entry for slave containing alignment quality:
1206 
1207  error = writeStringToAlignment(gb_toAlign, alignment, QUALITY_NAME, alignBuffer.quality(), ali_params.report==FA_TEMP_REPORT);
1208  if (!error) {
1209  // create temp-entry for master containing insert dots:
1210 
1211  int buflen = max_seq_length*2;
1212  char *buffer = ARB_alloc<char>(buflen+1);
1213  char *afterLast = buffer;
1214 
1215  if (!buffer) {
1216  error = "out of memory";
1217  }
1218  else {
1219  memset(buffer, '-', buflen);
1220  buffer[buflen] = 0;
1221 
1222  const FastAlignInsertion *inserts = report.insertion();
1223  while (inserts) {
1224  memset(buffer+inserts->offset(), '>', inserts->gaps());
1225  afterLast = buffer+inserts->offset()+inserts->gaps();
1226  inserts = inserts->next();
1227  }
1228 
1229  *afterLast = 0;
1230  if (gb_alignTo) {
1231  error = writeStringToAlignment(gb_alignTo, alignment, INSERTS_NAME, buffer, ali_params.report==FA_TEMP_REPORT);
1232  }
1233  }
1234  }
1235  }
1236  }
1237  }
1238  return error;
1239 }
1240 
1241 ARB_ERROR FastAligner_delete_temp_entries(GBDATA *gb_species, const char *alignment) {
1242  fa_assert(gb_species);
1243  fa_assert(alignment);
1244 
1245  GBDATA *gb_ali = GB_search(gb_species, alignment, GB_FIND);
1246  ARB_ERROR error = NULp;
1247 
1248  if (gb_ali) {
1249  GBDATA *gb_name = GB_search(gb_ali, QUALITY_NAME, GB_FIND);
1250  if (gb_name) {
1251  error = GB_delete(gb_name);
1252 #if defined(DEBUG)
1253  printf("deleted '%s' of '%s' (gb_name=%p)\n", QUALITY_NAME, GBT_get_name_or_description(gb_species), gb_name);
1254 #endif
1255  }
1256 
1257  if (!error) {
1258  gb_name = GB_search(gb_ali, INSERTS_NAME, GB_FIND);
1259  if (gb_name) {
1260  error = GB_delete(gb_name);
1261 #if defined(DEBUG)
1262  printf("deleted '%s' of '%s' (gb_name=%p)\n", INSERTS_NAME, GBT_get_name_or_description(gb_species), gb_name);
1263 #endif
1264  }
1265  }
1266  }
1267 
1268  return error;
1269 }
1270 
1271 static ARB_ERROR align_error(ARB_ERROR olderr, GBDATA *gb_toAlign, GBDATA *gb_alignTo) {
1272  // used by alignTo() and alignToNextRelative() to transform errors coming from subroutines
1273  // can be used by higher functions
1274 
1275  const char *name_toAlign = read_name(gb_toAlign);
1276  const char *name_alignTo = read_name(gb_alignTo);
1277 
1278  fa_assert(olderr);
1279 
1280  return GBS_global_string("Error while aligning '%s' to '%s':\n%s",
1281  name_toAlign, name_alignTo, olderr.deliver());
1282 }
1283 
1284 static ARB_ERROR alignTo(GBDATA *gb_toAlign,
1285  GB_CSTR alignment,
1286  const FastSearchSequence *alignTo,
1287  GBDATA *gb_alignTo, // may be NULp
1288  int max_seq_length,
1289  const AlignParams& ali_params)
1290 {
1291  ARB_ERROR error = NULp;
1292  long chksum = -1;
1293 
1294  CompactedSubSequence *toAlignSequence = readCompactedSequence(gb_toAlign, alignment, &error, NULp, &chksum, ali_params.range);
1295 
1296  if (island_hopper) {
1297  GBDATA *gb_seq = GBT_find_sequence(gb_toAlign, alignment); // get sequence
1298  if (gb_seq) {
1299  long length = GB_read_string_count(gb_seq);
1300  const char *data = GB_read_char_pntr(gb_seq);
1301 
1302  island_hopper->set_toAlign_sequence(data);
1303  island_hopper->set_alignment_length(length);
1304  }
1305  }
1306 
1307 
1308 
1309  if (!error) {
1310  error = alignCompactedTo(toAlignSequence, alignTo, max_seq_length, alignment, chksum, gb_toAlign, gb_alignTo, ali_params);
1311  if (error) error = align_error(error, gb_toAlign, gb_alignTo);
1312  delete toAlignSequence;
1313  }
1314 
1315  return error;
1316 }
1317 
1319  GB_CSTR alignment,
1320  Aligner_get_consensus_func get_consensus,
1321  int max_seq_length,
1322  const AlignParams& ali_params)
1323 {
1324  ARB_ERROR error = NULp;
1325  char *consensus = get_consensus(read_name(gb_toAlign), ali_params.range);
1326  size_t cons_len = strlen(consensus);
1327 
1328  fa_assert(cons_len);
1329 
1330  for (size_t i = 0; i<cons_len; ++i) { // translate consensus to be accepted by aligner
1331  switch (consensus[i]) {
1332  case '=': consensus[i] = '-'; break;
1333  default: break;
1334  }
1335  }
1336 
1337  CompactedSubSequence compacted(consensus, cons_len, "group consensus", ali_params.range.start());
1338 
1339  {
1340  FastSearchSequence fast(compacted);
1341  error = alignTo(gb_toAlign, alignment, &fast, NULp, max_seq_length, ali_params);
1342  }
1343 
1344  free(consensus);
1345 
1346  return error;
1347 }
1348 
1349 static void appendNameAndUsedBasePositions(char **toString, GBDATA *gb_species, int usedBasePositions) {
1350  // if usedBasePositions == -1 -> unknown;
1351 
1352  char *currInfo;
1353  if (usedBasePositions<0) {
1354  currInfo = strdup(GBT_get_name_or_description(gb_species));
1355  }
1356  else {
1357  fa_assert(usedBasePositions>0); // otherwise it should NOT be announced here!
1358  currInfo = GBS_global_string_copy("%s:%i", GBT_get_name_or_description(gb_species), usedBasePositions);
1359  }
1360 
1361  char *newString = NULp;
1362  if (*toString) {
1363  newString = GBS_global_string_copy("%s, %s", *toString, currInfo);
1364  }
1365  else {
1366  newString = currInfo;
1367  currInfo = NULp; // don't free
1368  }
1369 
1370  freeset(*toString, newString);
1371  free(currInfo);
1372 }
1373 
1374 inline int min(int i, int j) { return i<j ? i : j; }
1375 
1377  int max_seq_length,
1378  FA_turn turnAllowed,
1379  GB_CSTR alignment,
1380  GBDATA *gb_toAlign,
1381  const AlignParams& ali_params)
1382 {
1383  CompactedSubSequence *toAlignSequence = NULp;
1384  bool use_different_pt_server_alignment = 0 != strcmp(relSearch.pt_server_alignment, alignment);
1385 
1386  ARB_ERROR error;
1387  long chksum;
1388  int relativesToTest = relSearch.maxRelatives*2; // get more relatives from pt-server (needed when use_different_pt_server_alignment == true)
1389  char **nearestRelative = new char*[relativesToTest+1];
1390  int next_relatives;
1391  int i;
1392  GBDATA *gb_main = GB_get_root(gb_toAlign);
1393 
1394  if (use_different_pt_server_alignment) {
1395  turnAllowed = FA_TURN_NEVER; // makes no sense if we're using a different alignment for the pt_server
1396  }
1397 
1398  for (next_relatives=0; next_relatives<relativesToTest; next_relatives++) {
1399  nearestRelative[next_relatives] = NULp;
1400  }
1401  next_relatives = 0;
1402 
1403  bool restart = true;
1404  while (restart) {
1405  restart = false;
1406 
1407  char *findRelsBySeq = NULp;
1408  if (use_different_pt_server_alignment) {
1409  toAlignSequence = readCompactedSequence(gb_toAlign, alignment, &error, NULp, &chksum, ali_params.range);
1410 
1411  GBDATA *gbd = GBT_find_sequence(gb_toAlign, relSearch.pt_server_alignment); // use a different alignment for next relative search
1412  if (!gbd) {
1413  error = GBS_global_string("Species '%s' has no data in alignment '%s'", GBT_get_name_or_description(gb_toAlign), relSearch.pt_server_alignment);
1414  }
1415  else {
1416  findRelsBySeq = GB_read_string(gbd);
1417  }
1418  }
1419  else {
1420  toAlignSequence = readCompactedSequence(gb_toAlign, alignment, &error, &findRelsBySeq, &chksum, ali_params.range);
1421  }
1422 
1423  if (error) {
1424  delete toAlignSequence;
1425  return error; // @@@ leaks ?
1426  }
1427 
1428  while (next_relatives) {
1429  next_relatives--;
1430  freenull(nearestRelative[next_relatives]);
1431  }
1432 
1433  {
1434  // find relatives
1435  FamilyFinder *familyFinder = relSearch.getFamilyFinder();
1436  const PosRange& range = familyFinder->get_TargetRange();
1437 
1438  if (range.is_part()) {
1439  range.copy_corresponding_part(findRelsBySeq, findRelsBySeq, strlen(findRelsBySeq));
1440  turnAllowed = FA_TURN_NEVER; // makes no sense if we're using partial relative search
1441  }
1442 
1443  error = familyFinder->searchFamily(findRelsBySeq, FF_FORWARD, relativesToTest+1, 0); // @@@ make min_score configurable
1444 
1445  // @@@ case where no relative found (due to min score) handle how ? abort ? warn ?
1446 
1447  double bestScore = 0;
1448  if (!error) {
1449 #if defined(DEBUG)
1450  double lastScore = -1;
1451 #if defined(TRACE_RELATIVES)
1452  fprintf(stderr, "List of relatives used for '%s':\n", GBT_get_name_or_description(gb_toAlign));
1453 #endif // TRACE_RELATIVES
1454 #endif // DEBUG
1455  for (const FamilyList *fl = familyFinder->getFamilyList(); fl; fl=fl->next) {
1456  if (strcmp(toAlignSequence->name(), fl->name)!=0) {
1457  if (GBT_find_species(gb_main, fl->name)) { // @@@
1458  double thisScore = familyFinder->uses_rel_matches() ? fl->rel_matches : fl->matches;
1459 #if defined(DEBUG)
1460  // check whether family list is sorted correctly
1461  fa_assert(lastScore < 0 || lastScore >= thisScore);
1462  lastScore = thisScore;
1463 #if defined(TRACE_RELATIVES)
1464  fprintf(stderr, "- %s (%5.2f)\n", fl->name, thisScore);
1465 #endif // TRACE_RELATIVES
1466 #endif // DEBUG
1467  if (thisScore>=bestScore) bestScore = thisScore;
1468  if (next_relatives<(relativesToTest+1)) {
1469  nearestRelative[next_relatives] = strdup(fl->name);
1470  next_relatives++;
1471  }
1472  }
1473  }
1474  }
1475  }
1476 
1477  if (!error && turnAllowed != FA_TURN_NEVER) { // test if mirrored sequence has better relatives
1478  char *mirroredSequence = strdup(findRelsBySeq);
1479  long length = strlen(mirroredSequence);
1480  double bestMirroredScore = 0;
1481 
1482  char T_or_U;
1483  error = GBT_determine_T_or_U(global_alignmentType, &T_or_U, "reverse-complement");
1484  if (!error) {
1485  GBT_reverseComplementNucSequence(mirroredSequence, length, T_or_U);
1486  error = familyFinder->searchFamily(mirroredSequence, FF_FORWARD, relativesToTest+1, 0); // @@@ make min_score configurable
1487  }
1488  if (!error) {
1489 #if defined(DEBUG)
1490  double lastScore = -1;
1491 #if defined(TRACE_RELATIVES)
1492  fprintf(stderr, "List of relatives used for '%s' (turned seq):\n", GBT_get_name_or_description(gb_toAlign));
1493 #endif // TRACE_RELATIVES
1494 #endif // DEBUG
1495  for (const FamilyList *fl = familyFinder->getFamilyList(); fl; fl = fl->next) {
1496  double thisScore = familyFinder->uses_rel_matches() ? fl->rel_matches : fl->matches;
1497 #if defined(DEBUG)
1498  // check whether family list is sorted correctly
1499  fa_assert(lastScore < 0 || lastScore >= thisScore);
1500  lastScore = thisScore;
1501 #if defined(TRACE_RELATIVES)
1502  fprintf(stderr, "- %s (%5.2f)\n", fl->name, thisScore);
1503 #endif // TRACE_RELATIVES
1504 #endif // DEBUG
1505  if (thisScore >= bestMirroredScore) {
1506  if (strcmp(toAlignSequence->name(), fl->name)!=0) {
1507  if (GBT_find_species(gb_main, fl->name)) bestMirroredScore = thisScore; // @@@
1508  }
1509  }
1510  }
1511  }
1512 
1513  int turnIt = 0;
1514  if (bestMirroredScore>bestScore) {
1515  if (turnAllowed==FA_TURN_INTERACTIVE) {
1516  const char *message;
1517  if (familyFinder->uses_rel_matches()) {
1518  message = GBS_global_string("'%s' seems to be the other way round (score: %.1f%%, score if turned: %.1f%%)",
1519  toAlignSequence->name(), bestScore*100, bestMirroredScore*100);
1520  }
1521  else {
1522  message = GBS_global_string("'%s' seems to be the other way round (score: %li, score if turned: %li)",
1523  toAlignSequence->name(), long(bestScore+.5), long(bestMirroredScore+.5));
1524  }
1525  turnIt = aw_question("fastali_turn_sequence", message, "Turn sequence,Leave sequence alone")==0;
1526  }
1527  else {
1528  fa_assert(turnAllowed == FA_TURN_ALWAYS);
1529  turnIt = 1;
1530 #if defined(TRACE_RELATIVES)
1531  fprintf(stderr, "Using turned sequence!\n");
1532 #endif // TRACE_RELATIVES
1533  }
1534  }
1535 
1536  if (turnIt) { // write mirrored sequence
1537  GBDATA *gbd = GBT_find_sequence(gb_toAlign, alignment);
1538  GB_topSecurityLevel unsecured(gbd);
1539  error = GB_write_string(gbd, mirroredSequence);
1540  if (!error) {
1541  delete toAlignSequence;
1542  restart = true; // continue while loop
1543  }
1544  }
1545 
1546  free(mirroredSequence);
1547  }
1548  }
1549  free(findRelsBySeq);
1550  }
1551 
1552  if (!error) {
1553  if (!next_relatives) {
1554  char warning[200];
1555  sprintf(warning, "No relative found for '%s'", toAlignSequence->name());
1556  aw_message(warning);
1557  }
1558  else {
1559  // assuming relatives are sorted! (nearest to farthest)
1560 
1561  // get data pointers
1562  typedef GBDATA *GBDATAP;
1563  GBDATAP *gb_reference = new GBDATAP[relSearch.maxRelatives];
1564 
1565  {
1566  for (i=0; i<relSearch.maxRelatives && i<next_relatives; i++) {
1567  GBDATA *gb_species = GBT_find_species(gb_main, nearestRelative[i]);
1568  if (!gb_species) { // pt-server seems not up to date!
1569  error = species_not_found(nearestRelative[i]);
1570  break;
1571  }
1572 
1573  GBDATA *gb_sequence = GBT_find_sequence(gb_species, alignment);
1574  if (gb_sequence) { // if relative has sequence data in the current alignment ..
1575  gb_reference[i] = gb_species;
1576  }
1577  else { // remove this relative
1578  free(nearestRelative[i]);
1579  for (int j = i+1; j<next_relatives; ++j) {
1580  nearestRelative[j-1] = nearestRelative[j];
1581  }
1582  next_relatives--;
1583  nearestRelative[next_relatives] = NULp;
1584  i--; // redo same index
1585  }
1586  }
1587 
1588  // delete superfluous relatives
1589  for (; i<next_relatives; ++i) freenull(nearestRelative[i]);
1590 
1591  if (next_relatives>relSearch.maxRelatives) {
1592  next_relatives = relSearch.maxRelatives;
1593  }
1594  }
1595 
1596  // align
1597 
1598  if (!error) {
1599  CompactedSubSequence *alignToSequence = readCompactedSequence(gb_reference[0], alignment, &error, NULp, NULp, ali_params.range);
1600  fa_assert(alignToSequence);
1601 
1602  if (island_hopper) {
1603  GBDATA *gb_ref = GBT_find_sequence(gb_reference[0], alignment); // get reference sequence
1604  GBDATA *gb_align = GBT_find_sequence(gb_toAlign, alignment); // get sequence to align
1605 
1606  if (gb_ref && gb_align) {
1607  long length_ref = GB_read_string_count(gb_ref);
1608  const char *data;
1609 
1610  data = GB_read_char_pntr(gb_ref);
1611  island_hopper->set_ref_sequence(data);
1612 
1613  data = GB_read_char_pntr(gb_align);
1614  island_hopper->set_toAlign_sequence(data);
1615 
1616  island_hopper->set_alignment_length(length_ref);
1617  }
1618  }
1619 
1620  {
1621  FastSearchSequence referenceFastSeq(*alignToSequence);
1622 
1623  error = alignCompactedTo(toAlignSequence, &referenceFastSeq,
1624  max_seq_length, alignment, chksum,
1625  gb_toAlign, gb_reference[0], ali_params);
1626  }
1627 
1628  if (error) {
1629  error = align_error(error, gb_toAlign, gb_reference[0]);
1630  }
1631  else {
1632  char *used_relatives = NULp;
1633 
1634  if (unaligned_bases.is_empty()) {
1635  appendNameAndUsedBasePositions(&used_relatives, gb_reference[0], -1);
1636  }
1637  else {
1638  if (island_hopper) {
1639  appendNameAndUsedBasePositions(&used_relatives, gb_reference[0], -1);
1640  if (next_relatives>1) error = "Island hopping uses only one relative";
1641  }
1642  else {
1643  LooseBases loose;
1644  LooseBases loose_for_next_relative;
1645 
1646  int unaligned_positions;
1647  {
1648  CompactedSubSequence *alignedSequence = readCompactedSequence(gb_toAlign, alignment, &error, NULp, NULp, ali_params.range);
1649 
1650  unaligned_positions = loose.follow_ali_change_and_append(unaligned_bases, AliChange(*toAlignSequence, *alignedSequence));
1651  // now loose holds the unaligned (and recalculated) parts from last relative
1652  delete alignedSequence;
1653  }
1654 
1655  if (!error) {
1656  int toalign_positions = toAlignSequence->length();
1657  if (unaligned_positions<toalign_positions) {
1658  appendNameAndUsedBasePositions(&used_relatives, gb_reference[0], toalign_positions-unaligned_positions);
1659  }
1660  }
1661 
1662  for (i=1; i<next_relatives && !error; i++) {
1663  loose.append(loose_for_next_relative);
1664  int unaligned_positions_for_next = 0;
1665 
1666  while (!loose.is_empty() && !error) {
1667  ExplicitRange partRange(intersection(loose.recall(), ali_params.range));
1668  CompactedSubSequence *alignToPart = readCompactedSequence(gb_reference[i], alignment, &error, NULp, NULp, partRange);
1669 
1670  if (!error) {
1671  long part_chksum;
1672  CompactedSubSequence *toAlignPart = readCompactedSequence(gb_toAlign, alignment, &error, NULp, &part_chksum, partRange);
1673 
1674  fa_assert(contradicted(error, toAlignPart));
1675 
1676  if (!error) {
1677  AlignParams loose_ali_params = { ali_params.report, ali_params.showGapsMessages, partRange };
1678 
1679  FastSearchSequence referenceFastSeq(*alignToPart);
1680  error = alignCompactedTo(toAlignPart, &referenceFastSeq,
1681  max_seq_length, alignment, part_chksum,
1682  gb_toAlign, gb_reference[i], loose_ali_params);
1683  if (!error) {
1684  CompactedSubSequence *alignedPart = readCompactedSequence(gb_toAlign, alignment, &error, NULp, NULp, partRange);
1685  if (!error) {
1686  unaligned_positions_for_next += loose_for_next_relative.follow_ali_change_and_append(unaligned_bases, AliChange(*toAlignPart, *alignedPart));
1687  }
1688  delete alignedPart;
1689  }
1690  }
1691  delete toAlignPart;
1692  }
1693  delete alignToPart;
1694  }
1695 
1696  if (!error) {
1697  fa_assert(unaligned_positions_for_next <= unaligned_positions); // means: number of unaligned positions has increased by use of relative
1698  if (unaligned_positions_for_next<unaligned_positions) {
1699  appendNameAndUsedBasePositions(&used_relatives, gb_reference[i], unaligned_positions-unaligned_positions_for_next);
1700  unaligned_positions = unaligned_positions_for_next;
1701  }
1702  }
1703  }
1704  }
1705  }
1706 
1707  if (!error) { // write used relatives to db-field
1708  error = GBT_write_string(gb_toAlign, "used_rels", used_relatives);
1709  }
1710  free(used_relatives);
1711  }
1712 
1713  delete alignToSequence;
1714  }
1715 
1716  delete [] gb_reference;
1717  }
1718  }
1719 
1720  delete toAlignSequence;
1721 
1722  for (i=0; i<next_relatives; i++) freenull(nearestRelative[i]);
1723  delete [] nearestRelative;
1724 
1725  return error;
1726 }
1727 
1728 // ------------------------
1729 // AlignmentReference
1730 
1732  GB_CSTR alignment;
1733  int max_seq_length;
1734  const AlignParams& ali_params;
1735 
1736 public:
1738  int max_seq_length_,
1739  const AlignParams& ali_params_)
1740  : alignment(alignment_),
1741  max_seq_length(max_seq_length_),
1742  ali_params(ali_params_)
1743  {}
1744  virtual ~AlignmentReference() {}
1745 
1746  virtual ARB_ERROR align_to(GBDATA *gb_toalign) const = 0;
1747 
1748  GB_CSTR get_alignment() const { return alignment; }
1749  int get_max_seq_length() const { return max_seq_length; }
1750  const AlignParams& get_ali_params() const { return ali_params; }
1751 };
1752 
1753 
1754 #if defined(WARN_TODO)
1755 #warning make alignTo a member of ExplicitReference (or of AlignmentReference)
1756 #warning let alignToGroupConsensus and alignToNextRelative use ExplicitReference
1757 #endif
1758 
1759 class ExplicitReference: public AlignmentReference { // derived from a Noncopyable
1760  const FastSearchSequence *targetSequence;
1761  GBDATA *gb_alignTo;
1762 
1763 public:
1765  const FastSearchSequence *targetSequence_,
1766  GBDATA *gb_alignTo_,
1767  int max_seq_length_,
1768  const AlignParams& ali_params_)
1769  : AlignmentReference(alignment_, max_seq_length_, ali_params_),
1770  targetSequence(targetSequence_),
1771  gb_alignTo(gb_alignTo_)
1772  {}
1773 
1774  ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE {
1775  return alignTo(gb_toalign, get_alignment(), targetSequence, gb_alignTo, get_max_seq_length(), get_ali_params());
1776  }
1777 };
1778 
1779 #if defined(WARN_TODO)
1780 #warning make alignToGroupConsensus a member of ConsensusReference
1781 #endif
1782 
1784  Aligner_get_consensus_func get_consensus;
1785 
1786 public:
1788  Aligner_get_consensus_func get_consensus_,
1789  int max_seq_length_,
1790  const AlignParams& ali_params_)
1791  : AlignmentReference(alignment_, max_seq_length_, ali_params_),
1792  get_consensus(get_consensus_)
1793  {}
1794 
1795  ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE {
1796  return alignToGroupConsensus(gb_toalign, get_alignment(), get_consensus, get_max_seq_length(), get_ali_params());
1797  }
1798 };
1799 
1800 #if defined(WARN_TODO)
1801 #warning make alignToNextRelative a member of SearchRelativesReference
1802 #endif
1803 
1805  SearchRelativeParams& relSearch;
1806  FA_turn turnAllowed;
1807 
1808 public:
1810  int max_seq_length_,
1811  FA_turn turnAllowed_,
1812  GB_CSTR alignment_,
1813  const AlignParams& ali_params_)
1814  : AlignmentReference(alignment_, max_seq_length_, ali_params_),
1815  relSearch(relSearch_),
1816  turnAllowed(turnAllowed_)
1817  {}
1818 
1819  ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE {
1820  return alignToNextRelative(relSearch, get_max_seq_length(), turnAllowed, get_alignment(), gb_toalign, get_ali_params());
1821  }
1822 };
1823 
1824 
1825 // ----------------
1826 // Aligner
1827 
1828 class Aligner : virtual Noncopyable {
1829  GBDATA *gb_main;
1830 
1831  // define alignment target(s):
1832  FA_alignTarget alignWhat;
1833  GB_CSTR alignment; // name of alignment to use (if NULp -> use default)
1834  GB_CSTR toalign; // name of species to align (used if alignWhat == FA_CURRENT)
1835  Aligner_get_first_selected_species get_first_selected_species; // used if alignWhat == FA_SELECTED
1836  Aligner_get_next_selected_species get_next_selected_species; // --- "" ---
1837 
1838  // define reference sequence(s):
1839  GB_CSTR reference; // name of reference species
1840  Aligner_get_consensus_func get_consensus; // function to get consensus seq
1841  SearchRelativeParams& relSearch; // params to search for relatives
1842 
1843  // general params:
1844  FA_turn turnAllowed;
1845  const AlignParams& ali_params;
1846  int maxProtection; // protection level
1847 
1848  // -------------------- new members
1849  int wasNotAllowedToAlign; // number of failures caused by wrong protection
1850  int err_count; // count errors
1851  bool continue_on_error; /* true -> run single alignments in separate transactions.
1852  * If one target fails, continue with rest.
1853  * false -> run all in one transaction
1854  * One fails -> all fail!
1855  */
1856  FA_errorAction error_action;
1857 
1858  typedef std::list<GBDATA*> GBDATAlist;
1859  GBDATAlist species_to_mark; // species that will be marked after aligning
1860 
1861  ARB_ERROR alignToReference(GBDATA *gb_toalign, const AlignmentReference& ref);
1862  ARB_ERROR alignTargetsToReference(const AlignmentReference& ref, GBDATA *gb_species_data);
1863 
1864  ARB_ERROR alignToExplicitReference(GBDATA *gb_species_data, int max_seq_length);
1865  ARB_ERROR alignToConsensus(GBDATA *gb_species_data, int max_seq_length);
1866  ARB_ERROR alignToRelatives(GBDATA *gb_species_data, int max_seq_length);
1867 
1868  void triggerAction(GBDATA *gb_species, bool has_been_aligned) {
1869  bool mark = false;
1870  switch (error_action) {
1871  case FA_MARK_FAILED: mark = !has_been_aligned; break;
1872  case FA_MARK_ALIGNED: mark = has_been_aligned; break;
1873  case FA_NO_ACTION: mark = false; break;
1874  }
1875  if (mark) species_to_mark.push_back(gb_species);
1876  }
1877 
1878 public:
1879 
1880 #if defined(WARN_TODO)
1881 #warning pass AlignmentReference from caller (replacing reference parameters)
1882 #endif
1883 
1884  Aligner(GBDATA *gb_main_,
1885 
1886  // define alignment target(s):
1887  FA_alignTarget alignWhat_,
1888  GB_CSTR alignment_, // name of alignment to use (if NULp -> use default)
1889  GB_CSTR toalign_, // name of species to align (used if alignWhat == FA_CURRENT)
1890  Aligner_get_first_selected_species get_first_selected_species_, // used if alignWhat == FA_SELECTED
1891  Aligner_get_next_selected_species get_next_selected_species_, // --- "" ---
1892 
1893  // define reference sequence(s):
1894  GB_CSTR reference_, // name of reference species
1895  Aligner_get_consensus_func get_consensus_, // function to get consensus seq
1896  SearchRelativeParams& relSearch_, // params to search for relatives
1897 
1898  // general params:
1899  FA_turn turnAllowed_,
1900  const AlignParams& ali_params_,
1901  int maxProtection_, // protection level
1902  bool continue_on_error_,
1903  FA_errorAction error_action_)
1904  : gb_main(gb_main_),
1905  alignWhat(alignWhat_),
1906  alignment(alignment_),
1907  toalign(toalign_),
1908  get_first_selected_species(get_first_selected_species_),
1909  get_next_selected_species(get_next_selected_species_),
1910  reference(reference_),
1911  get_consensus(get_consensus_),
1912  relSearch(relSearch_),
1913  turnAllowed(turnAllowed_),
1914  ali_params(ali_params_),
1915  maxProtection(maxProtection_),
1916  wasNotAllowedToAlign(0),
1917  err_count(0),
1918  continue_on_error(continue_on_error_),
1919  error_action(continue_on_error ? error_action_ : FA_NO_ACTION)
1920  {}
1921 
1922  ARB_ERROR run();
1923 };
1924 
1925 ARB_ERROR Aligner::alignToReference(GBDATA *gb_toalign, const AlignmentReference& ref) {
1926  int myProtection = GB_read_security_write(GBT_find_sequence(gb_toalign, alignment));
1927  ARB_ERROR error;
1928 
1929  if (myProtection<=maxProtection) {
1930  // Depending on 'continue_on_error' we either
1931  // * stop aligning if an error occurs or
1932  // * run the alignment of each species in its own transaction, ignore but report errors and
1933  // optionally mark aligned or failed species.
1934 
1935  if (continue_on_error) {
1936  fa_assert(GB_get_transaction_level(gb_main) == 1);
1937  error = GB_end_transaction(gb_main, error); // end global transaction
1938  }
1939 
1940  if (!error) {
1941  error = GB_push_transaction(gb_main);
1942  if (!error) error = ref.align_to(gb_toalign);
1943  error = GB_end_transaction(gb_main, error);
1944 
1945  if (error) err_count++;
1946  triggerAction(gb_toalign, !error);
1947  }
1948 
1949  if (continue_on_error) {
1950  if (error) {
1951  GB_warning(error.deliver());
1952  error = NULp;
1953  }
1954  error = GB_begin_transaction(gb_main); // re-open global transaction
1955  }
1956  }
1957  else {
1958  wasNotAllowedToAlign++;
1959  triggerAction(gb_toalign, false);
1960  }
1961 
1962  return error;
1963 }
1964 
1965 ARB_ERROR Aligner::alignTargetsToReference(const AlignmentReference& ref, GBDATA *gb_species_data) {
1966  ARB_ERROR error;
1967 
1968  fa_assert(GB_get_transaction_level(gb_main) == 1);
1969 
1970  switch (alignWhat) {
1971  case FA_CURRENT: { // align one sequence
1972  fa_assert(toalign);
1973 
1974  GBDATA *gb_toalign = GBT_find_species_rel_species_data(gb_species_data, toalign);
1975  if (!gb_toalign) {
1976  error = species_not_found(toalign);
1977  }
1978  else {
1979  currentSequenceNumber = overallSequenceNumber = 1;
1980  error = alignToReference(gb_toalign, ref);
1981  }
1982  break;
1983  }
1984  case FA_MARKED: { // align all marked sequences
1985  int count = GBT_count_marked_species(gb_main);
1986  GBDATA *gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
1987 
1988  arb_progress progress("Aligning marked species", long(count));
1989  progress.auto_subtitles("Species");
1990 
1991  currentSequenceNumber = 1;
1992  overallSequenceNumber = count;
1993 
1994  while (gb_species && !error) {
1995  error = alignToReference(gb_species, ref);
1996  progress.inc_and_check_user_abort(error);
1997  gb_species = GBT_next_marked_species(gb_species);
1998  }
1999  break;
2000  }
2001  case FA_SELECTED: { // align all selected species
2002  int count;
2003  GBDATA *gb_species = get_first_selected_species(&count);
2004 
2005 
2006  currentSequenceNumber = 1;
2007  overallSequenceNumber = count;
2008 
2009  if (!gb_species) {
2010  aw_message("There is no selected species!");
2011  }
2012  else {
2013  arb_progress progress("Aligning selected species", long(count));
2014  progress.auto_subtitles("Species");
2015 
2016  while (gb_species && !error) {
2017  error = alignToReference(gb_species, ref);
2018  progress.inc_and_check_user_abort(error);
2019  gb_species = get_next_selected_species();
2020  }
2021  }
2022  break;
2023  }
2024  }
2025 
2026  fa_assert(GB_get_transaction_level(gb_main) == 1);
2027  return error;
2028 }
2029 
2030 ARB_ERROR Aligner::alignToExplicitReference(GBDATA *gb_species_data, int max_seq_length) {
2031  ARB_ERROR error;
2032  GBDATA *gb_reference = GBT_find_species_rel_species_data(gb_species_data, reference);
2033 
2034  if (!gb_reference) {
2035  error = species_not_found(reference);
2036  }
2037  else {
2038  long referenceChksum;
2039  CompactedSubSequence *referenceSeq = readCompactedSequence(gb_reference, alignment, &error, NULp, &referenceChksum, ali_params.range);
2040 
2041  if (island_hopper) {
2042 #if defined(WARN_TODO)
2043 #warning setting island_hopper reference has to be done in called function (seems that it is NOT done for alignToConsensus and alignToRelatives). First get tests in place!
2044 #endif
2045  GBDATA *gb_seq = GBT_find_sequence(gb_reference, alignment); // get sequence
2046  if (gb_seq) {
2047  long length = GB_read_string_count(gb_seq);
2048  const char *data = GB_read_char_pntr(gb_seq);
2049 
2050  island_hopper->set_ref_sequence(data);
2051  island_hopper->set_alignment_length(length);
2052  }
2053  }
2054 
2055 
2056  if (!error) {
2057 #if defined(WARN_TODO)
2058 #warning do not pass FastSearchSequence to ExplicitReference, instead pass sequence and length (ExplicitReference shall create it itself)
2059 #endif
2060 
2061  FastSearchSequence referenceFastSeq(*referenceSeq);
2062  ExplicitReference target(alignment, &referenceFastSeq, gb_reference, max_seq_length, ali_params);
2063 
2064  error = alignTargetsToReference(target, gb_species_data);
2065  }
2066  delete referenceSeq;
2067  }
2068  return error;
2069 }
2070 
2071 ARB_ERROR Aligner::alignToConsensus(GBDATA *gb_species_data, int max_seq_length) {
2072  return alignTargetsToReference(ConsensusReference(alignment, get_consensus, max_seq_length, ali_params),
2073  gb_species_data);
2074 }
2075 
2076 ARB_ERROR Aligner::alignToRelatives(GBDATA *gb_species_data, int max_seq_length) {
2077 
2078  return alignTargetsToReference(SearchRelativesReference(relSearch, max_seq_length, turnAllowed, alignment, ali_params),
2079  gb_species_data);
2080 }
2081 
2083  fa_assert(GB_get_transaction_level(gb_main) == 0); // no open transaction allowed here!
2084  ARB_ERROR error = GB_push_transaction(gb_main);
2085 
2086  // if neither 'reference' nor 'get_consensus' are specified -> use next relative for (each) sequence:
2087  bool search_by_pt_server = !reference && !get_consensus;
2088 
2089  err_count = 0;
2090  wasNotAllowedToAlign = 0; // incremented for every sequence which has higher protection level (and was not aligned)
2091  species_to_mark.clear();
2092 
2093  fa_assert(!reference || !get_consensus); // can't do both modes
2094 
2095  if (turnAllowed != FA_TURN_NEVER) {
2096  if ((ali_params.range.is_part()) || !search_by_pt_server) {
2097  // if not selected 'Range/Whole sequence' or not selected 'Reference/Auto search..'
2098  turnAllowed = FA_TURN_NEVER; // then disable mirroring for the current call
2099  }
2100  }
2101 
2102  if (!error && !alignment) {
2103  alignment = (GB_CSTR)GBT_get_default_alignment(gb_main); // get default alignment
2104  if (!alignment) error = "No default alignment";
2105  }
2106 
2107  if (!error && alignment) {
2108  global_alignmentType = GBT_get_alignment_type(gb_main, alignment);
2109  if (search_by_pt_server) {
2110  GB_alignment_type pt_server_alignmentType = GBT_get_alignment_type(gb_main, relSearch.pt_server_alignment);
2111 
2112  if (pt_server_alignmentType != GB_AT_RNA &&
2113  pt_server_alignmentType != GB_AT_DNA) {
2114  error = "pt_servers only support RNA/DNA sequences.\n"
2115  "In the aligner window you may specify a RNA/DNA alignment \n"
2116  "and use a pt_server build on that alignment.";
2117  }
2118  }
2119  }
2120 
2121  if (!error) {
2122  GBDATA *gb_species_data = GBT_get_species_data(gb_main);
2123  int max_seq_length = GBT_get_alignment_len(gb_main, alignment);
2124 
2125  if (reference) error = alignToExplicitReference(gb_species_data, max_seq_length);
2126  else if (get_consensus) error = alignToConsensus(gb_species_data, max_seq_length);
2127  else error = alignToRelatives(gb_species_data, max_seq_length);
2128  }
2129 
2130  ClustalV_exit();
2131  unaligned_bases.clear();
2132 
2133  error = GB_end_transaction(gb_main, error); // close global transaction
2134 
2135  if (wasNotAllowedToAlign>0) {
2136  const char *mess = GBS_global_string("%i species were not aligned (because of protection level)", wasNotAllowedToAlign);
2137  aw_message(mess);
2138  }
2139 
2140  if (err_count) {
2141  aw_message_if(error);
2142  error = GBS_global_string("Aligner produced %i error%c", err_count, err_count==1 ? '\0' : 's');
2143  }
2144 
2145  if (error_action != FA_NO_ACTION) {
2146  fa_assert(continue_on_error);
2147 
2148  GB_transaction ta(gb_main);
2149  GBT_mark_all(gb_main, 0);
2150  for (GBDATAlist::iterator sp = species_to_mark.begin(); sp != species_to_mark.end(); ++sp) {
2151  GB_write_flag(*sp, 1);
2152  }
2153 
2154  const char *whatsMarked = (error_action == FA_MARK_ALIGNED) ? "aligned" : "failed";
2155  size_t markCount = species_to_mark.size();
2156  if (markCount>0) {
2157  const char *msg = GBS_global_string("%zu %s species %s been marked",
2158  markCount,
2159  whatsMarked,
2160  (markCount == 1) ? "has" : "have");
2161  aw_message(msg);
2162  }
2163  }
2164 
2165  return error;
2166 }
2167 
2168 void FastAligner_start(AW_window *aw, const AlignDataAccess *data_access) {
2169  AW_root *root = aw->get_root();
2170  char *reference = NULp; // align against next relatives
2171  char *toalign = NULp; // align marked species
2172  ARB_ERROR error = NULp;
2173  int get_consensus = 0;
2174  int pt_server_id = -1;
2175 
2178 
2180  if (root->awar(FA_AWAR_USE_ISLAND_HOPPING)->read_int()) {
2181  island_hopper = new IslandHopping;
2182  if (root->awar(FA_AWAR_USE_SECONDARY)->read_int()) {
2183  char *helix_string = data_access->getHelixString();
2184  if (helix_string) {
2185  island_hopper->set_helix(helix_string);
2186  free(helix_string);
2187  }
2188  else {
2189  error = "Warning: No HELIX found. Can't use secondary structure";
2190  }
2191  }
2192 
2193  if (!error) {
2194  island_hopper->set_parameters(root->awar(FA_AWAR_ESTIMATE_BASE_FREQ)->read_int(),
2207  root->awar(FA_AWAR_GAP_A)->read_float(),
2208  root->awar(FA_AWAR_GAP_B)->read_float(),
2209  root->awar(FA_AWAR_GAP_C)->read_float(),
2211  );
2212  }
2213  }
2214 
2215  FA_alignTarget alignWhat = static_cast<FA_alignTarget>(root->awar(FA_AWAR_TO_ALIGN)->read_int());
2216  if (!error) {
2217  switch (alignWhat) {
2218  case FA_CURRENT: { // align current species
2219  toalign = root->awar(AWAR_SPECIES_NAME)->read_string();
2220  break;
2221  }
2222  case FA_MARKED: { // align marked species
2223  break;
2224  }
2225  case FA_SELECTED: { // align selected species
2226  get_first_selected_species = data_access->get_first_selected_species;
2227  get_next_selected_species = data_access->get_next_selected_species;
2228  break;
2229  }
2230  default: {
2231  fa_assert(0);
2232  break;
2233  }
2234  }
2235 
2236  switch (static_cast<FA_reference>(root->awar(FA_AWAR_REFERENCE)->read_int())) {
2237  case FA_REF_EXPLICIT: // align against specified species
2238  reference = root->awar(FA_AWAR_REFERENCE_NAME)->read_string();
2239  break;
2240 
2241  case FA_REF_CONSENSUS: // align against group consensus
2242  if (data_access->get_group_consensus) {
2243  get_consensus = 1;
2244  }
2245  else {
2246  error = "Can't get group consensus here.";
2247  }
2248  break;
2249 
2250  case FA_REF_RELATIVES: // align against species searched via pt_server
2251  pt_server_id = root->awar(AWAR_PT_SERVER)->read_int();
2252  if (pt_server_id<0) {
2253  error = "No pt_server selected";
2254  }
2255  break;
2256 
2257  default: fa_assert(0);
2258  break;
2259  }
2260  }
2261 
2262  RangeList ranges;
2263  bool autoRestrictRange4nextRelSearch = true;
2264 
2265  if (!error) {
2266  switch (static_cast<FA_range>(root->awar(FA_AWAR_RANGE)->read_int())) {
2267  case FA_WHOLE_SEQUENCE:
2268  autoRestrictRange4nextRelSearch = false;
2269  ranges.add(PosRange::whole());
2270  break;
2271 
2272  case FA_AROUND_CURSOR: {
2273  int curpos = root->awar(AWAR_CURSOR_POSITION_LOCAL)->read_int();
2274  int size = root->awar(FA_AWAR_AROUND)->read_int();
2275 
2276  ranges.add(PosRange(curpos-size, curpos+size));
2277  break;
2278  }
2279  case FA_SELECTED_RANGE: {
2280  PosRange range;
2281  if (!data_access->get_selected_range(range)) {
2282  error = "There is no selected species!";
2283  }
2284  else {
2285  ranges.add(range);
2286  }
2287  break;
2288  }
2289 
2290  case FA_SAI_MULTI_RANGE: {
2291  GB_transaction ta(data_access->gb_main);
2292 
2293  char *sai_name = root->awar(FA_AWAR_SAI_RANGE_NAME)->read_string();
2294  char *aliuse = GBT_get_default_alignment(data_access->gb_main);
2295 
2296  GBDATA *gb_sai = GBT_expect_SAI(data_access->gb_main, sai_name);
2297  if (!gb_sai) error = GB_await_error();
2298  else {
2299  GBDATA *gb_data = GBT_find_sequence(gb_sai, aliuse);
2300  if (!gb_data) {
2301  error = GB_have_error()
2302  ? GB_await_error()
2303  : GBS_global_string("SAI '%s' has no data in alignment '%s'", sai_name, aliuse);
2304  }
2305  else {
2306  char *sai_data = GB_read_string(gb_data); // @@@ NOT_ALL_SAI_HAVE_DATA
2307  char *set_bits = root->awar(FA_AWAR_SAI_RANGE_CHARS)->read_string();
2308 
2309  ranges = build_RangeList_from_string(sai_data, set_bits, false);
2310 
2311  free(set_bits);
2312  free(sai_data);
2313  }
2314  }
2315  free(aliuse);
2316  free(sai_name);
2317  break;
2318  }
2319  }
2320  }
2321 
2322  if (!error) {
2323  for (RangeList::iterator r = ranges.begin(); r != ranges.end() && !error; ++r) {
2324  PosRange range = *r;
2325 
2326  GBDATA *gb_main = data_access->gb_main;
2327  char *editor_alignment = NULp;
2328  long alignment_length;
2329  {
2330  GB_transaction ta(gb_main);
2331  char *default_alignment = GBT_get_default_alignment(gb_main);
2332 
2333  alignment_length = GBT_get_alignment_len(gb_main, default_alignment);
2334 
2335  editor_alignment = root->awar_string(AWAR_EDITOR_ALIGNMENT, default_alignment)->read_string();
2336  free(default_alignment);
2337 
2338  }
2339 
2340  char *pt_server_alignment = root->awar(FA_AWAR_PT_SERVER_ALIGNMENT)->read_string();
2341  PosRange relRange = PosRange::whole(); // unrestricted!
2342 
2343  if (autoRestrictRange4nextRelSearch) {
2344  AW_awar *awar_relrange = root->awar(FA_AWAR_RELATIVE_RANGE);
2345  const char *relrange = awar_relrange->read_char_pntr();
2346  if (relrange[0]) {
2347  int region_plus = atoi(relrange);
2348 
2349  fa_assert(range.is_part());
2350 
2351  relRange = PosRange(range.start()-region_plus, range.end()+region_plus); // restricted
2352  awar_relrange->write_as_string(GBS_global_string("%i", region_plus)); // set awar to detected value (avoid misbehavior when it contains ' ' or similar)
2353  }
2354  }
2355 
2356  if (island_hopper) {
2357  island_hopper->set_range(ExplicitRange(range, alignment_length));
2358  range = PosRange::whole();
2359  }
2360 
2361  SearchRelativeParams relSearch(new PT_FamilyFinder(gb_main,
2362  pt_server_id,
2363  root->awar(AWAR_NN_OLIGO_LEN)->read_int(),
2364  root->awar(AWAR_NN_MISMATCHES)->read_int(),
2365  root->awar(AWAR_NN_FAST_MODE)->read_int(),
2366  root->awar(AWAR_NN_REL_MATCHES)->read_int(),
2367  RSS_BOTH_MIN), // old scaling as b4 [8520] @@@ make configurable
2368  pt_server_alignment,
2370 
2371  relSearch.getFamilyFinder()->restrict_2_region(relRange);
2372 
2373  struct AlignParams ali_params = {
2374  static_cast<FA_report>(root->awar(FA_AWAR_REPORT)->read_int()),
2375  bool(root->awar(FA_AWAR_SHOW_GAPS_MESSAGES)->read_int()),
2376  range
2377  };
2378 
2379  {
2380  arb_progress progress("FastAligner");
2381  progress.allow_title_reuse();
2382 
2383  int cont_on_error = root->awar(FA_AWAR_CONTINUE_ON_ERROR)->read_int();
2384 
2385  Aligner aligner(gb_main,
2386  alignWhat,
2387  editor_alignment,
2388  toalign,
2389  get_first_selected_species,
2390  get_next_selected_species,
2391 
2392  reference,
2393  get_consensus ? data_access->get_group_consensus : NULp,
2394  relSearch,
2395 
2396  static_cast<FA_turn>(root->awar(FA_AWAR_MIRROR)->read_int()),
2397  ali_params,
2398  root->awar(FA_AWAR_PROTECTION)->read_int(),
2399  cont_on_error,
2401  error = aligner.run();
2402 
2403  if (error && cont_on_error) {
2404  aw_message_if(error);
2405  error = NULp;
2406  }
2407  }
2408 
2409  free(pt_server_alignment);
2410  free(editor_alignment);
2411  }
2412  }
2413 
2414  if (island_hopper) {
2415  delete island_hopper;
2416  island_hopper = NULp;
2417  }
2418 
2419  if (toalign) free(toalign);
2420  aw_message_if(error);
2421  if (data_access->do_refresh) data_access->refresh_display();
2422 }
2423 
2424 static void nullcb() { }
2425 
2427  root->awar_string(FA_AWAR_REFERENCE_NAME, "", db1);
2428 
2429  root->awar_int(FA_AWAR_TO_ALIGN, FA_CURRENT, db1);
2432 
2433  AW_awar *ali_protect = root->awar_int(FA_AWAR_PROTECTION, 0, db1);
2434  if (ARB_in_novice_mode(root)) {
2435  ali_protect->write_int(0); // reset protection for noobs
2436  }
2437 
2438  root->awar_int(FA_AWAR_AROUND, 25, db1);
2440  root->awar_int(FA_AWAR_REPORT, FA_NO_REPORT, db1);
2441  root->awar_int(FA_AWAR_SHOW_GAPS_MESSAGES, 1, db1);
2442  root->awar_int(FA_AWAR_CONTINUE_ON_ERROR, 1, db1);
2444  root->awar_int(FA_AWAR_USE_SECONDARY, 0, db1);
2445  root->awar_int(AWAR_PT_SERVER, 0, db1);
2446  root->awar_int(FA_AWAR_NEXT_RELATIVES, 1, db1)->set_minmax(1, 100);
2447 
2448  root->awar_string(FA_AWAR_RELATIVE_RANGE, "", db1);
2450 
2451  root->awar_string(FA_AWAR_SAI_RANGE_NAME, "", db1);
2452  root->awar_string(FA_AWAR_SAI_RANGE_CHARS, "x1", db1);
2453 
2454  // island hopping:
2455 
2456  root->awar_int(FA_AWAR_USE_ISLAND_HOPPING, 0, db1);
2457 
2458  root->awar_int(FA_AWAR_ESTIMATE_BASE_FREQ, 1, db1);
2459 
2460  root->awar_float(FA_AWAR_BASE_FREQ_A, 0.25, db1);
2461  root->awar_float(FA_AWAR_BASE_FREQ_C, 0.25, db1);
2462  root->awar_float(FA_AWAR_BASE_FREQ_G, 0.25, db1);
2463  root->awar_float(FA_AWAR_BASE_FREQ_T, 0.25, db1);
2464 
2465  root->awar_float(FA_AWAR_SUBST_PARA_AC, 1.0, db1);
2466  root->awar_float(FA_AWAR_SUBST_PARA_AG, 4.0, db1);
2467  root->awar_float(FA_AWAR_SUBST_PARA_AT, 1.0, db1);
2468  root->awar_float(FA_AWAR_SUBST_PARA_CG, 1.0, db1);
2469  root->awar_float(FA_AWAR_SUBST_PARA_CT, 4.0, db1);
2470  root->awar_float(FA_AWAR_SUBST_PARA_GT, 1.0, db1);
2471 
2472  root->awar_float(FA_AWAR_EXPECTED_DISTANCE, 0.3, db1);
2473  root->awar_float(FA_AWAR_STRUCTURE_SUPPLEMENT, 0.5, db1);
2474  root->awar_float(FA_AWAR_THRESHOLD, 0.005, db1);
2475 
2476  root->awar_float(FA_AWAR_GAP_A, 8.0, db1);
2477  root->awar_float(FA_AWAR_GAP_B, 4.0, db1);
2478  root->awar_float(FA_AWAR_GAP_C, 7.0, db1);
2479 
2480  AWTC_create_common_next_neighbour_vars(root, makeRootCallback(nullcb));
2481 }
2482 
2484  root->awar_int(FA_AWAR_TO_ALIGN, FA_CURRENT, db1);
2485 }
2486 
2488  // sets the aligner reference species to current species
2489  char *specName = root->awar(AWAR_SPECIES_NAME)->read_string();
2490  root->awar(FA_AWAR_REFERENCE_NAME)->write_string(specName);
2491  free(specName);
2492 }
2493 
2495  AW_window_simple *aws = new AW_window_simple;
2496 
2497  aws->init(root, "ISLAND_HOPPING_PARA", "Parameters for Island Hopping");
2498  aws->load_xfig("faligner/islandhopping.fig");
2499 
2500  aws->at("close");
2501  aws->callback(AW_POPDOWN);
2502  aws->create_button("CLOSE", "CLOSE", "O");
2503 
2504  aws->at("help");
2505  aws->callback(makeHelpCallback("islandhopping.hlp"));
2506  aws->create_button("HELP", "HELP");
2507 
2508  aws->at("use_secondary");
2509  aws->label("Use secondary structure (only for re-align)");
2510  aws->create_toggle(FA_AWAR_USE_SECONDARY);
2511 
2512  aws->at("freq");
2513  aws->create_toggle_field(FA_AWAR_ESTIMATE_BASE_FREQ, "Base freq.", "B");
2514  aws->insert_default_toggle("Estimate", "E", 1);
2515  aws->insert_toggle("Define here: ", "D", 0);
2516  aws->update_toggle_field();
2517 
2518  aws->at("freq_a"); aws->label("A:"); aws->create_input_field(FA_AWAR_BASE_FREQ_A, 4);
2519  aws->at("freq_c"); aws->label("C:"); aws->create_input_field(FA_AWAR_BASE_FREQ_C, 4);
2520  aws->at("freq_g"); aws->label("G:"); aws->create_input_field(FA_AWAR_BASE_FREQ_G, 4);
2521  aws->at("freq_t"); aws->label("T:"); aws->create_input_field(FA_AWAR_BASE_FREQ_T, 4);
2522 
2523  int xpos[4], ypos[4];
2524  {
2525  aws->button_length(1);
2526 
2527  int dummy;
2528  aws->at("h_a"); aws->get_at_position(&xpos[0], &dummy); aws->create_button(NULp, "A");
2529  aws->at("h_c"); aws->get_at_position(&xpos[1], &dummy); aws->create_button(NULp, "C");
2530  aws->at("h_g"); aws->get_at_position(&xpos[2], &dummy); aws->create_button(NULp, "G");
2531  aws->at("h_t"); aws->get_at_position(&xpos[3], &dummy); aws->create_button(NULp, "T");
2532 
2533  aws->at("v_a"); aws->get_at_position(&dummy, &ypos[0]); aws->create_button(NULp, "A");
2534  aws->at("v_c"); aws->get_at_position(&dummy, &ypos[1]); aws->create_button(NULp, "C");
2535  aws->at("v_g"); aws->get_at_position(&dummy, &ypos[2]); aws->create_button(NULp, "G");
2536  aws->at("v_t"); aws->get_at_position(&dummy, &ypos[3]); aws->create_button(NULp, "T");
2537  }
2538 
2539  aws->at("subst"); aws->create_button(NULp, "Substitution rate parameters:");
2540 
2541 #define XOFF -25
2542 #define YOFF 0
2543 
2544  aws->at(xpos[1]+XOFF, ypos[0]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_AC, 4);
2545  aws->at(xpos[2]+XOFF, ypos[0]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_AG, 4);
2546  aws->at(xpos[3]+XOFF, ypos[0]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_AT, 4);
2547  aws->at(xpos[2]+XOFF, ypos[1]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_CG, 4);
2548  aws->at(xpos[3]+XOFF, ypos[1]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_CT, 4);
2549  aws->at(xpos[3]+XOFF, ypos[2]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_GT, 4);
2550 
2551 #undef XOFF
2552 #undef YOFF
2553 
2554  aws->label_length(22);
2555 
2556  aws->at("dist");
2557  aws->label("Expected distance");
2558  aws->create_input_field(FA_AWAR_EXPECTED_DISTANCE, 5);
2559 
2560  aws->at("supp");
2561  aws->label("Structure supplement");
2562  aws->create_input_field(FA_AWAR_STRUCTURE_SUPPLEMENT, 5);
2563 
2564  aws->at("thres");
2565  aws->label("Threshold");
2566  aws->create_input_field(FA_AWAR_THRESHOLD, 5);
2567 
2568  aws->label_length(10);
2569 
2570  aws->at("gapA");
2571  aws->label("Gap A");
2572  aws->create_input_field(FA_AWAR_GAP_A, 5);
2573 
2574  aws->at("gapB");
2575  aws->label("Gap B");
2576  aws->create_input_field(FA_AWAR_GAP_B, 5);
2577 
2578  aws->at("gapC");
2579  aws->label("Gap C");
2580  aws->create_input_field(FA_AWAR_GAP_C, 5);
2581 
2582  return aws;
2583 }
2584 
2586  static AW_window_simple *aws = NULp;
2587 
2588  if (!aws) {
2589  aws = new AW_window_simple;
2590 
2591  aws->init(root, "FAMILY_PARAMS", "Family search parameters");
2592  aws->load_xfig("faligner/family_settings.fig");
2593 
2594  aws->at("close");
2595  aws->callback(AW_POPDOWN);
2596  aws->create_button("CLOSE", "CLOSE", "O");
2597 
2598  aws->at("help");
2599  aws->callback(makeHelpCallback("next_neighbours_common.hlp"));
2600  aws->create_button("HELP", "HELP");
2601 
2602  aws->auto_space(5, 5);
2604  }
2605 
2606  return aws;
2607 }
2608 
2610  { FA_AWAR_USE_ISLAND_HOPPING, "island" },
2611  { FA_AWAR_TO_ALIGN, "target" },
2612  { FA_AWAR_REFERENCE, "ref" },
2613  { FA_AWAR_REFERENCE_NAME, "refname" },
2614  { FA_AWAR_RELATIVE_RANGE, "relrange" },
2615  { FA_AWAR_NEXT_RELATIVES, "relatives" },
2616  { FA_AWAR_PT_SERVER_ALIGNMENT, "ptali" },
2617 
2618  // relative-search specific parameters from subwindow (create_family_settings_window)
2619  // same as ../DB_UI/ui_species.cxx@RELATIVES_CONFIG
2620  { AWAR_NN_OLIGO_LEN, "oligolen" },
2621  { AWAR_NN_MISMATCHES, "mismatches" },
2622  { AWAR_NN_FAST_MODE, "fastmode" },
2623  { AWAR_NN_REL_MATCHES, "relmatches" },
2624  { AWAR_NN_REL_SCALING, "relscaling" },
2625 
2626  // island-hopping parameters (create_island_hopping_window)
2627  { FA_AWAR_USE_SECONDARY, "use2nd" },
2628  { FA_AWAR_ESTIMATE_BASE_FREQ, "estbasefreq" },
2629  { FA_AWAR_BASE_FREQ_A, "freq_a" },
2630  { FA_AWAR_BASE_FREQ_C, "freq_c" },
2631  { FA_AWAR_BASE_FREQ_G, "freq_g" },
2632  { FA_AWAR_BASE_FREQ_T, "freq_t" },
2633  { FA_AWAR_SUBST_PARA_AC, "subst_ac" },
2634  { FA_AWAR_SUBST_PARA_AG, "subst_ag" },
2635  { FA_AWAR_SUBST_PARA_AT, "subst_at" },
2636  { FA_AWAR_SUBST_PARA_CG, "subst_cg" },
2637  { FA_AWAR_SUBST_PARA_CT, "subst_ct" },
2638  { FA_AWAR_SUBST_PARA_GT, "subst_gt" },
2639  { FA_AWAR_EXPECTED_DISTANCE, "distance" },
2640  { FA_AWAR_STRUCTURE_SUPPLEMENT, "supplement" },
2641  { FA_AWAR_THRESHOLD, "threshold" },
2642  { FA_AWAR_GAP_A, "gap_a" },
2643  { FA_AWAR_GAP_B, "gap_b" },
2644  { FA_AWAR_GAP_C, "gap_c" },
2645 
2646  { NULp, NULp }
2647 };
2648 
2650  AW_window_simple *aws = new AW_window_simple;
2651 
2652  aws->init(root, "INTEGRATED_ALIGNERS", INTEGRATED_ALIGNERS_TITLE);
2653  aws->load_xfig("faligner/faligner.fig");
2654 
2655  aws->label_length(10);
2656  aws->button_length(10);
2657 
2658  aws->at("close");
2659  aws->callback(AW_POPDOWN);
2660  aws->create_button("CLOSE", "CLOSE", "O");
2661 
2662  aws->at("help");
2663  aws->callback(makeHelpCallback("faligner.hlp"));
2664  aws->create_button("HELP", "HELP");
2665 
2666  aws->at("aligner");
2667  aws->create_toggle_field(FA_AWAR_USE_ISLAND_HOPPING, "Aligner", "A");
2668  aws->insert_default_toggle("Fast aligner", "F", 0);
2669  aws->sens_mask(AWM_EXP);
2670  aws->insert_toggle ("Island Hopping", "I", 1);
2671  aws->sens_mask(AWM_ALL);
2672  aws->update_toggle_field();
2673 
2674  aws->button_length(12);
2675  aws->at("island_para");
2676  aws->callback(create_island_hopping_window);
2677  aws->sens_mask(AWM_EXP);
2678  aws->create_button("island_para", "Parameters", "");
2679  aws->sens_mask(AWM_ALL);
2680 
2681  aws->button_length(10);
2682 
2683  aws->at("rev_compl");
2684  aws->callback(makeWindowCallback(build_reverse_complement, data_access));
2685  aws->create_button("reverse_complement", "Turn now!", "");
2686 
2687  aws->at("what");
2688  aws->create_toggle_field(FA_AWAR_TO_ALIGN, "Align what?", "A");
2689  aws->insert_toggle ("Current Species:", "A", FA_CURRENT);
2690  aws->insert_default_toggle("Marked Species", "M", FA_MARKED);
2691  aws->insert_toggle ("Selected Species", "S", FA_SELECTED);
2692  aws->update_toggle_field();
2693 
2694  aws->at("swhat");
2695  aws->create_input_field(AWAR_SPECIES_NAME, 2);
2696 
2697  aws->at("against");
2698  aws->create_toggle_field(FA_AWAR_REFERENCE, "Reference", "");
2699  aws->insert_toggle ("Species by name:", "S", FA_REF_EXPLICIT);
2700  aws->insert_toggle ("Group consensus", "K", FA_REF_CONSENSUS);
2701  aws->insert_default_toggle("Auto search by pt_server:", "A", FA_REF_RELATIVES);
2702  aws->update_toggle_field();
2703 
2704  aws->at("sagainst");
2705  aws->create_input_field(FA_AWAR_REFERENCE_NAME, 2);
2706 
2707  aws->at("copy");
2709  aws->create_button("Copy", "Copy", "");
2710 
2711  aws->label_length(0);
2712  aws->at("pt_server");
2714 
2715  aws->label_length(23);
2716  aws->at("relrange");
2717  aws->label("Data from range only, plus");
2718  aws->create_input_field(FA_AWAR_RELATIVE_RANGE, 3);
2719 
2720  aws->at("relatives");
2721  aws->label("Number of relatives to use");
2722  aws->create_input_field(FA_AWAR_NEXT_RELATIVES, 3);
2723 
2724  aws->label_length(9);
2725  aws->at("use_ali");
2726  aws->label("Alignment");
2727  aws->create_input_field(FA_AWAR_PT_SERVER_ALIGNMENT, 12);
2728 
2729  aws->at("relSett");
2730  aws->callback(create_family_settings_window);
2731  aws->create_autosize_button("Settings", "More settings", "");
2732 
2733  // Range
2734 
2735  aws->label_length(10);
2736  aws->at("range");
2737  aws->create_toggle_field(FA_AWAR_RANGE, "Range", "");
2738  aws->insert_default_toggle("Whole sequence", "", FA_WHOLE_SEQUENCE);
2739  aws->insert_toggle ("Positions around cursor: ", "", FA_AROUND_CURSOR);
2740  aws->insert_toggle ("Selected range", "", FA_SELECTED_RANGE);
2741  aws->insert_toggle ("Multi-Range by SAI", "", FA_SAI_MULTI_RANGE);
2742  aws->update_toggle_field();
2743 
2744  aws->at("around");
2745  aws->create_input_field(FA_AWAR_AROUND, 2);
2746 
2747  aws->at("sai");
2749 
2750  aws->at("rchars");
2751  aws->create_input_field(FA_AWAR_SAI_RANGE_CHARS, 2);
2752 
2753  // Protection
2754 
2755  aws->at("protection");
2756  aws->label("Protection");
2757  aws->create_option_menu(FA_AWAR_PROTECTION, true);
2758  aws->insert_default_option("0", NULp, 0);
2759  aws->insert_option ("1", NULp, 1);
2760  aws->insert_option ("2", NULp, 2);
2761  aws->insert_option ("3", NULp, 3);
2762  aws->insert_option ("4", NULp, 4);
2763  aws->insert_option ("5", NULp, 5);
2764  aws->insert_option ("6", NULp, 6);
2765  aws->update_option_menu();
2766 
2767  // MirrorCheck
2768 
2769  aws->at("mirror");
2770  aws->label("Turn check");
2771  aws->create_option_menu(FA_AWAR_MIRROR, true);
2772  aws->insert_option ("Never turn sequence", "", FA_TURN_NEVER);
2773  aws->insert_default_option("User acknowledgment ", "", FA_TURN_INTERACTIVE);
2774  aws->insert_option ("Automatically turn sequence", "", FA_TURN_ALWAYS);
2775  aws->update_option_menu();
2776 
2777  // Report
2778 
2779  aws->at("insert");
2780  aws->label("Report");
2781  aws->create_option_menu(FA_AWAR_REPORT, true);
2782  aws->insert_option ("No report", "", FA_NO_REPORT);
2783  aws->sens_mask(AWM_EXP);
2784  aws->insert_default_option("Report to temporary entries", "", FA_TEMP_REPORT);
2785  aws->insert_option ("Report to resident entries", "", FA_REPORT);
2786  aws->sens_mask(AWM_ALL);
2787  aws->update_option_menu();
2788 
2789  aws->at("gaps");
2790  aws->create_toggle(FA_AWAR_SHOW_GAPS_MESSAGES);
2791 
2792  aws->at("continue");
2793  aws->create_toggle(FA_AWAR_CONTINUE_ON_ERROR);
2794 
2795  aws->at("on_failure");
2796  aws->label("On failure");
2797  aws->create_option_menu(FA_AWAR_ACTION_ON_ERROR, true);
2798  aws->insert_default_option("do nothing", "", FA_NO_ACTION);
2799  aws->insert_option ("mark failed", "", FA_MARK_FAILED);
2800  aws->insert_option ("mark aligned", "", FA_MARK_ALIGNED);
2801  aws->update_option_menu();
2802 
2803  aws->at("align");
2804  aws->callback(makeWindowCallback(FastAligner_start, data_access));
2805  aws->highlight();
2806  aws->create_button("GO", "GO", "G");
2807 
2808  aws->at("config");
2809  AWT_insert_config_manager(aws, AW_ROOT_DEFAULT, "aligner", aligner_config_mapping);
2810 
2811  return aws;
2812 }
2813 
2814 // --------------------------------------------------------------------------------
2815 
2816 #ifdef UNIT_TESTS
2817 
2818 #include <test_unit.h>
2819 
2820 // ---------------------
2821 // OligoCounter
2822 
2823 #include <map>
2824 #include <string>
2825 
2826 using std::map;
2827 using std::string;
2828 
2829 typedef map<string, size_t> OligoCount;
2830 
2831 class OligoCounter {
2832  size_t oligo_len;
2833  size_t datasize;
2834 
2835  mutable OligoCount occurrence;
2836 
2837  static string removeGaps(const char *seq) {
2838  size_t len = strlen(seq);
2839  string nogaps;
2840  nogaps.reserve(len);
2841 
2842  for (size_t p = 0; p<len; ++p) {
2843  char c = seq[p];
2844  if (!is_gap(c)) nogaps.append(1, c);
2845  }
2846  return nogaps;
2847  }
2848 
2849  void count_oligos(const string& seq) {
2850  occurrence.clear();
2851  size_t max_pos = seq.length()-oligo_len;
2852  for (size_t p = 0; p <= max_pos; ++p) {
2853  string oligo(seq, p, oligo_len);
2854  occurrence[oligo]++;
2855  }
2856  }
2857 
2858 public:
2859  OligoCounter()
2860  : oligo_len(0),
2861  datasize(0)
2862  {}
2863  OligoCounter(const char *seq, size_t oligo_len_)
2864  : oligo_len(oligo_len_)
2865  {
2866  string seq_nogaps = removeGaps(seq);
2867  datasize = seq_nogaps.length();
2868  count_oligos(seq_nogaps);
2869  }
2870 
2871  size_t oligo_count(const char *oligo) {
2872  fa_assert(strlen(oligo) == oligo_len);
2873  return occurrence[oligo];
2874  }
2875 
2876  size_t similarity_score(const OligoCounter& other) const {
2877  size_t score = 0;
2878  if (oligo_len == other.oligo_len) {
2879  for (OligoCount::const_iterator o = occurrence.begin(); o != occurrence.end(); ++o) {
2880  const string& oligo = o->first;
2881  size_t count = o->second;
2882 
2883  score += min(count, other.occurrence[oligo]);
2884  }
2885  }
2886  return score;
2887  }
2888 
2889  size_t getDataSize() const { return datasize; }
2890 };
2891 
2892 void TEST_OligoCounter() {
2893  OligoCounter oc1("CCAGGT", 3);
2894  OligoCounter oc2("GGTCCA", 3);
2895  OligoCounter oc2_gaps("..GGT--CCA..", 3);
2896  OligoCounter oc3("AGGTCC", 3);
2897  OligoCounter oc4("AGGTCCAGG", 3);
2898 
2899  TEST_EXPECT_EQUAL(oc1.oligo_count("CCA"), 1);
2900  TEST_EXPECT_ZERO(oc1.oligo_count("CCG"));
2901  TEST_EXPECT_EQUAL(oc4.oligo_count("AGG"), 2);
2902 
2903  int sc1_2 = oc1.similarity_score(oc2);
2904  int sc2_1 = oc2.similarity_score(oc1);
2905  TEST_EXPECT_EQUAL(sc1_2, sc2_1);
2906 
2907  int sc1_2gaps = oc1.similarity_score(oc2_gaps);
2908  TEST_EXPECT_EQUAL(sc1_2, sc1_2gaps);
2909 
2910  int sc1_3 = oc1.similarity_score(oc3);
2911  int sc2_3 = oc2.similarity_score(oc3);
2912  int sc3_4 = oc3.similarity_score(oc4);
2913 
2914  TEST_EXPECT_EQUAL(sc1_2, 2); // common oligos (CCA GGT)
2915  TEST_EXPECT_EQUAL(sc1_3, 2); // common oligos (AGG GGT)
2916  TEST_EXPECT_EQUAL(sc2_3, 3); // common oligos (GGT GTC TCC)
2917 
2918  TEST_EXPECT_EQUAL(sc3_4, 4);
2919 }
2920 
2921 // -------------------------
2922 // FakeFamilyFinder
2923 
2924 class FakeFamilyFinder: public FamilyFinder { // derived from a Noncopyable
2925  // used by unit tests to detect next relatives instead of asking the pt-server
2926 
2927  GBDATA *gb_main;
2928  string ali_name;
2929  map<string, OligoCounter> oligos_counted; // key = species name
2930  PosRange counted_for_range;
2931  size_t oligo_len;
2932 
2933 public:
2934  FakeFamilyFinder(GBDATA *gb_main_, string ali_name_, bool rel_matches_, size_t oligo_len_)
2935  : FamilyFinder(rel_matches_, RSS_BOTH_MIN),
2936  gb_main(gb_main_),
2937  ali_name(ali_name_),
2938  counted_for_range(PosRange::whole()),
2939  oligo_len(oligo_len_)
2940  {}
2941 
2942  GB_ERROR searchFamily(const char *sequence, FF_complement compl_mode, int max_results, double min_score) OVERRIDE {
2943  // 'sequence' has to contain full sequence or part corresponding to 'range'
2944 
2945  TEST_EXPECT_EQUAL(compl_mode, FF_FORWARD); // not fit for other modes
2946 
2948 
2949  OligoCounter seq_oligo_count(sequence, oligo_len);
2950 
2951  if (range != counted_for_range) {
2952  oligos_counted.clear(); // forget results for different range
2953  counted_for_range = range;
2954  }
2955 
2956  char *buffer = NULp;
2957  int buffersize = 0;
2958 
2959  bool partial_match = range.is_part();
2960 
2961  GB_transaction ta(gb_main);
2962  int results = 0;
2963 
2964  for (GBDATA *gb_species = GBT_first_species(gb_main);
2965  gb_species && results<max_results;
2966  gb_species = GBT_next_species(gb_species))
2967  {
2968  string name = GBT_get_name_or_description(gb_species);
2969  if (oligos_counted.find(name) == oligos_counted.end()) {
2970  GBDATA *gb_data = GBT_find_sequence(gb_species, ali_name.c_str());
2971  const char *spec_seq = GB_read_char_pntr(gb_data);
2972 
2973  if (partial_match) {
2974  int spec_seq_len = GB_read_count(gb_data);
2975  int range_len = ExplicitRange(range, spec_seq_len).size();
2976 
2977  if (buffersize<range_len) {
2978  delete [] buffer;
2979  buffersize = range_len;
2980  buffer = new char[buffersize+1];
2981  }
2982 
2983  range.copy_corresponding_part(buffer, spec_seq, spec_seq_len);
2984  oligos_counted[name] = OligoCounter(buffer, oligo_len);
2985  }
2986  else {
2987  oligos_counted[name] = OligoCounter(spec_seq, oligo_len);
2988  }
2989  }
2990 
2991  const OligoCounter& spec_oligo_count = oligos_counted[name];
2992  size_t score = seq_oligo_count.similarity_score(spec_oligo_count);
2993 
2994  if (score>=min_score) {
2995  FamilyList *newMember = new FamilyList;
2996 
2997  newMember->name = strdup(name.c_str());
2998  newMember->matches = score;
2999  newMember->rel_matches = score/spec_oligo_count.getDataSize();
3000  newMember->next = NULp;
3001 
3002  family_list = newMember->insertSortedBy_matches(family_list);
3003  results++;
3004  }
3005  }
3006 
3007  delete [] buffer;
3008 
3009  return NULp;
3010  }
3011 };
3012 
3013 // ----------------------------
3014 // test_alignment_data
3015 
3016 #include <arb_unit_test.h>
3017 
3018 static const char *test_aliname = "ali_test";
3019 
3020 static const char *get_aligned_data_of(GBDATA *gb_main, const char *species_name) {
3021  GB_transaction ta(gb_main);
3022  ARB_ERROR error;
3023  const char *data = NULp;
3024 
3025  GBDATA *gb_species = GBT_find_species(gb_main, species_name);
3026  if (!gb_species) error = GB_await_error();
3027  else {
3028  GBDATA *gb_data = GBT_find_sequence(gb_species, test_aliname);
3029  if (!gb_data) error = GB_await_error();
3030  else {
3031  data = GB_read_char_pntr(gb_data);
3032  if (!data) error = GB_await_error();
3033  }
3034  }
3035 
3036  TEST_EXPECT_NULL(error.deliver());
3037 
3038  return data;
3039 }
3040 
3041 static const char *get_used_rels_for(GBDATA *gb_main, const char *species_name) {
3042  GB_transaction ta(gb_main);
3043  const char *result = NULp;
3044  GBDATA *gb_species = GBT_find_species(gb_main, species_name);
3045  if (!gb_species) result = GBS_global_string("<No such species '%s'>", species_name);
3046  else {
3047  GBDATA *gb_used_rels = GB_search(gb_species, "used_rels", GB_FIND);
3048  if (!gb_used_rels) result = "<No such field 'used_rels'>";
3049  else result = GB_read_char_pntr(gb_used_rels);
3050  }
3051  return result;
3052 }
3053 
3054 static GB_ERROR forget_used_relatives(GBDATA *gb_main) {
3055  GB_ERROR error = NULp;
3056  GB_transaction ta(gb_main);
3057  for (GBDATA *gb_species = GBT_first_species(gb_main);
3058  gb_species && !error;
3059  gb_species = GBT_next_species(gb_species))
3060  {
3061  GBDATA *gb_used_rels = GB_search(gb_species, "used_rels", GB_FIND);
3062  if (gb_used_rels) {
3063  error = GB_delete(gb_used_rels);
3064  }
3065  }
3066  return error;
3067 }
3068 
3069 
3070 #define ALIGNED_DATA_OF(name) get_aligned_data_of(gb_main, name)
3071 #define USED_RELS_FOR(name) get_used_rels_for(gb_main, name)
3072 
3073 // ----------------------------------------
3074 
3075 static GBDATA *selection_fake_gb_main = NULp;
3076 static GBDATA *selection_fake_gb_last = NULp;
3077 
3078 static GBDATA *fake_first_selected(int *count) {
3079  selection_fake_gb_last = NULp;
3080  *count = 2; // we fake two species as selected ("c1" and "c2")
3081  return GBT_find_species(selection_fake_gb_main, "c1");
3082 }
3083 static GBDATA *fake_next_selected() {
3084  if (!selection_fake_gb_last) {
3085  selection_fake_gb_last = GBT_find_species(selection_fake_gb_main, "c2");
3086  }
3087  else {
3088  selection_fake_gb_last = NULp;
3089  }
3090  return selection_fake_gb_last;
3091 }
3092 
3093 static char *fake_get_consensus(const char*, PosRange range) {
3094  const char *data = get_aligned_data_of(selection_fake_gb_main, "s1");
3095  if (range.is_whole()) return strdup(data);
3096  return ARB_strpartdup(data+range.start(), data+range.end());
3097 }
3098 
3099 static void test_install_fakes(GBDATA *gb_main) {
3100  selection_fake_gb_main = gb_main;
3101 }
3102 
3103 
3104 // ----------------------------------------
3105 
3106 static AlignParams test_ali_params = {
3107  FA_NO_REPORT,
3108  false, // showGapsMessages
3109  PosRange::whole()
3110 };
3111 
3112 static AlignParams test_ali_params_partial = {
3113  FA_NO_REPORT,
3114  false, // showGapsMessages
3115  PosRange(25, 60)
3116 };
3117 
3118 // ----------------------------------------
3119 
3120 static struct arb_unit_test::test_alignment_data TestAlignmentData_TargetAndReferenceHandling[] = {
3121  { 0, "s1", ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........." }, // reference
3122  { 0, "s2", "AUCUCCUAAACCCAACCGUAGUUCGAAUUGAGGACUGUAACUC......................................................" }, // align single sequence (same data as reference)
3123  { 1, "m1", "UAGAGGAUUUGGGUUGGCAUCAAGCUUAACUCCUGACAUUGAG......................................................" }, // align marked sequences.. (complement of reference)
3124  { 1, "m2", "...UCCUAAACCAACCCGUAGUUCGAAUUGAGGACUGUAA........................................................." },
3125  { 1, "m3", "AUC---UAAACCAACCCGUAGUUCGAAUUGAGGACUG---CUC......................................................" },
3126  { 0, "c1", "AUCUCCUAAACCCAACC--------AAUUGAGGACUGUAACUC......................................................" }, // align selected sequences..
3127  { 0, "c2", "AUCUCCU------AACCGUAGUUCCCCGAA------ACUGUAACUC..................................................." },
3128  { 0, "r1", "GAGUUACAGUCCUCAAUUCGGGGAACUACGGUUGGGUUUAGGAGAU..................................................." }, // align by faked pt_server
3129 };
3130 
3131 void TEST_Aligner_TargetAndReferenceHandling() {
3132  // performs some alignments to check logic of target and reference handling
3133 
3134  GB_shell shell;
3135  ARB_ERROR error;
3136  GBDATA *gb_main = TEST_CREATE_DB(error, test_aliname, TestAlignmentData_TargetAndReferenceHandling, false);
3137 
3138  TEST_EXPECT_NULL(error.deliver());
3139 
3140  SearchRelativeParams search_relative_params(new FakeFamilyFinder(gb_main, test_aliname, false, 8),
3141  test_aliname,
3142  2);
3143 
3144  test_install_fakes(gb_main);
3145  arb_suppress_progress silence;
3146 
3147  // bool cont_on_err = true;
3148  bool cont_on_err = false;
3149 
3150  TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), 3); // we got 3 marked species
3151  {
3152  Aligner aligner(gb_main,
3153  FA_CURRENT,
3154  test_aliname,
3155  "s2", // toalign
3156  NULp, // get_first_selected_species
3157  NULp, // get_next_selected_species
3158  "s1", // reference species
3159  NULp, // get_consensus
3160  search_relative_params, // relative search
3162  test_ali_params,
3163  0,
3164  cont_on_err,
3165  FA_NO_ACTION);
3166  error = aligner.run();
3167  TEST_EXPECT_NULL(error.deliver());
3168  }
3169  TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), 3); // we still got 3 marked species
3170  {
3171  Aligner aligner(gb_main,
3172  FA_MARKED,
3173  test_aliname,
3174  NULp, // toalign
3175  NULp, // get_first_selected_species
3176  NULp, // get_next_selected_species
3177  "s1", // reference species
3178  NULp, // get_consensus
3179  search_relative_params, // relative search
3181  test_ali_params,
3182  0,
3183  cont_on_err,
3184  FA_MARK_FAILED);
3185  error = aligner.run();
3186  TEST_EXPECT_NULL(error.deliver());
3187 
3188  TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 0); // FA_MARK_FAILED (none failed -> none marked)
3189  }
3190  {
3191  Aligner aligner(gb_main,
3192  FA_SELECTED,
3193  test_aliname,
3194  NULp, // toalign
3195  fake_first_selected, // get_first_selected_species
3196  fake_next_selected, // get_next_selected_species
3197  NULp, // reference species
3198  fake_get_consensus, // get_consensus
3199  search_relative_params, // relative search
3201  test_ali_params,
3202  0,
3203  cont_on_err,
3204  FA_MARK_ALIGNED);
3205  error = aligner.run();
3206  TEST_EXPECT_NULL(error.deliver());
3207 
3208  TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 2); // FA_MARK_ALIGNED (2 selected were aligned)
3209  }
3210  {
3211  Aligner aligner(gb_main,
3212  FA_CURRENT,
3213  test_aliname,
3214  "r1", // toalign
3215  NULp, // get_first_selected_species
3216  NULp, // get_next_selected_species
3217  NULp, // reference species
3218  NULp, // get_consensus
3219  search_relative_params, // relative search
3221  test_ali_params,
3222  0,
3223  cont_on_err,
3224  FA_MARK_ALIGNED);
3225 
3226  error = aligner.run();
3227  TEST_EXPECT_NULL(error.deliver());
3228 
3229  TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 1);
3230  }
3231 
3232  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s2"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C...........");
3233 
3234  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m1"), ".......UAG--AGG-A------U-U-UGGGU-UG-G-C-A-U-CAA-GCU--------UAA-C-UCCUG-AC--A-UUGAG...............");
3235  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m2"), "..............U-C------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA................");
3236  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m3"), ".........A--U----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----CU-C...........");
3237 
3238  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-------------------AA-U-UGAGG-AC--U-GUAA-CU-C...........");
3239  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c2"), ".........A--UCU-C------C-U-AA---------C-C-G-UAG-UUC------------C-CCGAA-AC--U-GUAA-CU-C...........");
3240 
3241  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("r1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUCCCC-----GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // here sequence shall be turned!
3242 
3243 
3244  TEST_EXPECT_EQUAL(USED_RELS_FOR("r1"), "s2:43, s1:1");
3245 
3246  // ----------------------------------------------
3247  // now align all others vs next relative
3248 
3249  search_relative_params.maxRelatives = 5;
3250  TEST_EXPECT_NO_ERROR(forget_used_relatives(gb_main));
3251 
3252  int species_count = ARRAY_ELEMS(TestAlignmentData_TargetAndReferenceHandling);
3253  for (int sp = 0; sp<species_count; ++sp) {
3254  const char *name = TestAlignmentData_TargetAndReferenceHandling[sp].name;
3255  if (strcmp(name, "r1") != 0) { // skip "r1" (already done above)
3256  Aligner aligner(gb_main,
3257  FA_CURRENT,
3258  test_aliname,
3259  name, // toalign
3260  NULp, // get_first_selected_species
3261  NULp, // get_next_selected_species
3262  NULp, // reference species
3263  NULp, // get_consensus
3264  search_relative_params, // relative search
3266  test_ali_params,
3267  0,
3268  cont_on_err,
3269  FA_MARK_ALIGNED);
3270 
3271  error = aligner.run();
3272  TEST_EXPECT_NULL(error.deliver());
3273 
3274  TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 1);
3275  }
3276  }
3277 
3278  TEST_EXPECT_EQUAL(USED_RELS_FOR("s1"), "s2");
3279  TEST_EXPECT_EQUAL(USED_RELS_FOR("s2"), "s1"); // same as done manually
3280  TEST_EXPECT_EQUAL(USED_RELS_FOR("m1"), "r1:42");
3281  TEST_EXPECT_EQUAL(USED_RELS_FOR("m2"), "m3");
3282  TEST_EXPECT_EQUAL(USED_RELS_FOR("m3"), "m2");
3283  TEST_EXPECT_EQUAL(USED_RELS_FOR("c1"), "r1");
3284  TEST_EXPECT_EQUAL(USED_RELS_FOR("c2"), "r1");
3285 
3286  // range aligned below (see test_ali_params_partial)
3287  // "-------------------------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX------------------------------------"
3288  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // 1st aligning of 's1'
3289  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s2"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above (again aligned vs 's1')
3290 
3291  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m1"), ".........U--AGA-G------G---AUUUG-GG-U-U-G-G-CAU-CAAGCU-----UAA-C-UCCUG-AC--A-UUGAG---------------"); // changed; @@@ bug: no dots at end
3292 
3293  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m2"), ".........U--C----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----UA-A..........."); // changed (1st align vs 's1', this align vs 'm3')
3294  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m3"), ".........A--U----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----CU-C..........."); // same_as_above (1st align vs 's1', this align vs 'm2')
3295  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-------------------AA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above
3296  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c2"), ".........A--UCU-C------C-U--------A-A-C-C-G-UAG-UUCCCC-----GA--------A-AC--U-GUAA-CU-C..........."); // changed
3297 
3298  // --------------------------------------
3299  // test partial relative search
3300 
3301  search_relative_params.getFamilyFinder()->restrict_2_region(test_ali_params_partial.range);
3302  TEST_EXPECT_NO_ERROR(forget_used_relatives(gb_main));
3303 
3304  for (int sp = 0; sp<species_count; ++sp) {
3305  const char *name = TestAlignmentData_TargetAndReferenceHandling[sp].name;
3306  Aligner aligner(gb_main,
3307  FA_CURRENT,
3308  test_aliname,
3309  name, // toalign
3310  NULp, // get_first_selected_species
3311  NULp, // get_next_selected_species
3312  NULp, // reference species
3313  NULp, // get_consensus
3314  search_relative_params, // relative search
3315  FA_TURN_NEVER,
3316  test_ali_params_partial,
3317  0,
3318  cont_on_err,
3319  FA_MARK_ALIGNED);
3320 
3321  error = aligner.run();
3322  TEST_EXPECT_NULL(error.deliver());
3323 
3324  TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 1);
3325  }
3326 
3327  TEST_EXPECT_EQUAL(USED_RELS_FOR("s1"), "s2");
3328  TEST_EXPECT_EQUAL(USED_RELS_FOR("s2"), "s1");
3329  TEST_EXPECT_EQUAL(USED_RELS_FOR("m1"), "r1"); // (not really differs)
3330  TEST_EXPECT_EQUAL(USED_RELS_FOR("m2"), "m3");
3331  TEST_EXPECT_EQUAL(USED_RELS_FOR("m3"), "m2");
3332  TEST_EXPECT_EQUAL(USED_RELS_FOR("c1"), "r1");
3333  TEST_EXPECT_EQUAL(USED_RELS_FOR("c2"), "r1");
3334  TEST_EXPECT_EQUAL(USED_RELS_FOR("r1"), "s2:20, c2:3");
3335 
3336  // aligned range (see test_ali_params_partial)
3337  // "-------------------------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX------------------------------------"
3338  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above
3339  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s2"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above
3340 
3341  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m1"), ".........U--AGA-G------G-A-UU-UG-GG-U-U-G-G-CAU-CAAGCU-----UAA-C-UCCUG-AC--A-UUGAG---------------"); // changed
3342  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m2"), ".........U--C----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----UA-A..........."); // same_as_above
3343  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m3"), ".........A--U----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----CU-C..........."); // same_as_above
3344 
3345  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-------------------AA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above
3346  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c2"), ".........A--UCU-C------C---------UA-A-C-C-G-UAG-UUCCCC-----GA--------A-AC--U-GUAA-CU-C..........."); // changed
3347 
3348  TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("r1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUCCCC-----GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above
3349 
3350  GB_close(gb_main);
3351 }
3352 
3353 // ----------------------------------------
3354 
3355 static struct arb_unit_test::test_alignment_data TestAlignmentData_checksumError[] = {
3356  { 0, "MtnK1722", "...G-GGC-C-G............CCC-GG--------CAAUGGGGGCGGCCCGGCGGAC----GG--C-UCAGU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUG-CGGC-UGGAUCACCUCC....." }, // gets aligned
3357  { 0, "MhnFormi", "...A-CGA-U-C------------CUUCGG--------GGUCG-U-GG-C-GU-A--C------GG--C-UCAGU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUG-CGGC-UGGAUCACCUCCU...." }, // 1st relative
3358  { 0, "MhnT1916", "...A-CGA-A-C------------CUU-GU--------GUUCG-U-GG-C-GA-A--C------GG--C-UCAGU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUG-CGGC-UGGAUCACCUCCU...." }, // next relative
3359  { 0, "MthVanni", "...U-GGU-U-U------------C-------------GGCCA-U-GG-C-GG-A--C------GG--C-UCAUU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUG-CGGC-UGGAUCACCUCC....." }, // next relative
3360  { 0, "ThcCeler", "...G-GGG-C-G...CC-U---U--------GC--G--CGCAC-C-GG-C-GG-A--C------GG--C-UCAGU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUA-CGGC-UCGAUCACCUCCU...." }, // next relative
3361 };
3362 
3363 void TEST_SLOW_Aligner_checksumError() {
3364  // @@@ SLOW cause this often gets terminated in nightly builds
3365  // no idea why. normally it runs a few ms
3366 
3367  // this produces an internal aligner error
3368 
3369  GB_shell shell;
3370  ARB_ERROR error;
3371  GBDATA *gb_main = TEST_CREATE_DB(error, test_aliname, TestAlignmentData_checksumError, false);
3372 
3373  SearchRelativeParams search_relative_params(new FakeFamilyFinder(gb_main, test_aliname, false, 8),
3374  test_aliname,
3375  10); // use up to 10 relatives
3376 
3377  test_install_fakes(gb_main);
3378  arb_suppress_progress silence;
3379 
3380  bool cont_on_err = true;
3381  if (!error) {
3382  Aligner aligner(gb_main,
3383  FA_CURRENT,
3384  test_aliname,
3385  "MtnK1722", // toalign
3386  NULp, // get_first_selected_species
3387  NULp, // get_next_selected_species
3388  NULp, // reference species
3389  NULp, // get_consensus
3390  search_relative_params, // relative search
3392  test_ali_params,
3393  0,
3394  cont_on_err,
3395  FA_MARK_ALIGNED);
3396 
3397  error = aligner.run();
3398  }
3399  {
3400  GB_ERROR err = error.deliver();
3401  TEST_EXPECT_NULL__BROKEN(err, "Aligner produced 1 error");
3402  }
3403  TEST_EXPECT_EQUAL__BROKEN(USED_RELS_FOR("MtnK1722"), "???", "<No such field 'used_rels'>"); // subsequent failure
3404 
3405  GB_close(gb_main);
3406 }
3407 
3408 static const char *asstr(LooseBases& ub) {
3409  LooseBases tmp;
3410  while (!ub.is_empty()) tmp.memorize(ub.recall());
3411 
3412  const char *result = "";
3413  while (!tmp.is_empty()) {
3414  ExplicitRange r = tmp.recall();
3415  result = GBS_global_string("%s %i/%i", result, r.start(), r.end());
3416  ub.memorize(r);
3417  }
3418  return result;
3419 }
3420 
3421 void TEST_BASIC_UnalignedBases() {
3422  LooseBases ub;
3423  TEST_EXPECT(ub.is_empty());
3424  TEST_EXPECT_EQUAL(asstr(ub), "");
3425 
3426  // test add+remove
3427  ub.memorize(ExplicitRange(5, 7));
3428  TEST_REJECT(ub.is_empty());
3429  TEST_EXPECT_EQUAL(asstr(ub), " 5/7");
3430 
3431  TEST_EXPECT(ub.recall() == ExplicitRange(5, 7));
3432  TEST_EXPECT(ub.is_empty());
3433 
3434  ub.memorize(ExplicitRange(2, 4));
3435  TEST_EXPECT_EQUAL(asstr(ub), " 2/4");
3436 
3437  ub.memorize(ExplicitRange(4, 9));
3438  TEST_EXPECT_EQUAL(asstr(ub), " 2/4 4/9");
3439 
3440  ub.memorize(ExplicitRange(8, 10));
3441  ub.memorize(ExplicitRange(11, 14));
3442  ub.memorize(ExplicitRange(12, 17));
3443  TEST_EXPECT_EQUAL(asstr(ub), " 2/4 4/9 8/10 11/14 12/17");
3444  TEST_EXPECT_EQUAL(asstr(ub), " 2/4 4/9 8/10 11/14 12/17"); // check asstr has no side-effect
3445 
3446  {
3447  LooseBases toaddNrecalc;
3448 
3449  CompactedSubSequence Old("ACGTACGT", 8, "name1");
3450  CompactedSubSequence New("--A-C--G-T--A-C-G-T", 19, "name2");
3451  // ---------------------- 0123456789012345678
3452 
3453  toaddNrecalc.memorize(ExplicitRange(1, 7));
3454  toaddNrecalc.memorize(ExplicitRange(3, 5));
3455  TEST_EXPECT_EQUAL(asstr(toaddNrecalc), " 1/7 3/5");
3456 
3457  ub.follow_ali_change_and_append(toaddNrecalc, AliChange(Old, New));
3458 
3459  TEST_EXPECT_EQUAL(asstr(ub), " 3/18 8/15 2/4 4/9 8/10 11/14 12/17");
3460  TEST_EXPECT(toaddNrecalc.is_empty());
3461 
3462  LooseBases selfRecalc;
3463  selfRecalc.follow_ali_change_and_append(ub, AliChange(New, New));
3464  TEST_EXPECT_EQUAL__BROKEN(asstr(selfRecalc),
3465  " 3/18 8/15 0/6 3/11 8/11 10/15 10/17", // wanted behavior?
3466  " 3/18 8/17 0/6 3/11 8/13 10/15 10/18"); // doc wrong behavior @@@ "8/17", "8/13", "10/18" are wrong
3467 
3468  ub.follow_ali_change_and_append(selfRecalc, AliChange(New, Old));
3469  TEST_EXPECT_EQUAL__BROKEN(asstr(ub),
3470  " 1/7 3/5 0/1 1/3 3/3 4/5 4/6", // wanted behavior? (from wanted behavior above)
3471  " 1/7 3/7 0/2 1/4 3/5 4/6 4/7"); // document wrong behavior
3472  TEST_EXPECT_EQUAL__BROKEN(asstr(ub),
3473  " 1/7 3/6 0/1 1/3 3/4 4/5 4/7", // wanted behavior? (from wrong result above)
3474  " 1/7 3/7 0/2 1/4 3/5 4/6 4/7"); // document wrong behavior
3475  }
3476 }
3477 
3478 
3479 #endif // UNIT_TESTS
3480 
3481 
GB_ERROR GB_begin_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2516
#define GAP
#define FA_AWAR_SUBST_PARA_CT
void AWTC_create_common_next_neighbour_fields(AW_window *aws, int scaler_length)
const CompactedSubSequence & sequence() const
Definition: seq_search.hxx:265
int expdPosition(int cPos) const
Definition: seq_search.hxx:200
void restoreDots(CompactedSubSequence &slaveSequence)
Definition: seq_search.cxx:168
bool is_empty() const
const char * GB_ERROR
Definition: arb_core.h:25
long gaps() const
Definition: seq_search.hxx:414
#define QUALITY_NAME
void FastAligner_create_variables(AW_root *root, AW_default db1)
static LooseBases unaligned_bases
string result
bool is_whole() const
Definition: pos_range.h:68
char alignQuality(char slave, char master)
long free() const
Definition: seq_search.hxx:355
void GB_warning(const char *message)
Definition: arb_msg.cxx:484
SearchRelativeParams(FamilyFinder *ff_, const char *pt_server_alignment_, int maxRelatives_)
long expdPosition() const
Definition: seq_search.hxx:267
static int currentSequenceNumber
const FastAlignInsertion * insertion() const
Definition: seq_search.hxx:463
GBDATA * GBT_first_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:113
#define CAN_SCORE_LEFT()
#define FA_AWAR_ESTIMATE_BASE_FREQ
virtual ARB_ERROR align_to(GBDATA *gb_toalign) const =0
void count_aligned_base(int mismatched)
Definition: seq_search.hxx:465
#define FA_AWAR_RELATIVE_RANGE
ARB_ERROR fast_align(const CompactedSubSequence &align_to, AlignBuffer *alignBuffer, int max_seq_length, int matchScore, int mismatchScore, FastAlignReport *report) const
GBDATA * GBT_first_marked_species_rel_species_data(GBDATA *gb_species_data)
Definition: aditem.cxx:109
#define FA_AWAR_GAP_A
bool may_refer_to_same_part_as(const CompactedSubSequence &other) const
Definition: seq_search.hxx:236
static ARB_ERROR alignToGroupConsensus(GBDATA *gb_toAlign, GB_CSTR alignment, Aligner_get_consensus_func get_consensus, int max_seq_length, const AlignParams &ali_params)
return string(buffer, length)
ARB_ERROR species_not_found(GB_CSTR species_name)
#define FA_AWAR_EXPECTED_DISTANCE
ARB_ERROR run()
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
Definition: arbdb.cxx:1385
void load_xfig(const char *file, bool resize=true)
Definition: AW_window.cxx:717
virtual GB_ERROR searchFamily(const char *sequence, FF_complement compl_mode, int max_results, double min_score)=0
FamilyFinder * getFamilyFinder()
#define ACID
PosRange intersection(PosRange r1, PosRange r2)
Definition: pos_range.h:98
#define FA_AWAR_AROUND
int length() const
Definition: seq_search.hxx:187
ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE
#define FA_AWAR_BASE_FREQ_A
int relatedBases(char base1, char base2)
void FastAligner_set_align_current(AW_root *root, AW_default db1)
Aligner_get_first_selected_species get_first_selected_species
#define FA_AWAR_TO_ALIGN
int start() const
Definition: pos_range.h:57
#define FA_AWAR_SUBST_PARA_AG
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2549
long
Definition: AW_awar.cxx:154
GB_CSTR get_alignment() const
void allow_title_reuse()
Definition: arb_progress.h:261
void insertGap(AlignBuffer *alignBuffer, SequencePosition &master, FastAlignReport *report)
const char * text() const
Definition: seq_search.hxx:350
#define INTEGRATED_ALIGNERS_TITLE
range_set::const_iterator iterator
Definition: RangeList.h:46
void AWT_insert_config_manager(AW_window *aww, AW_default default_file_, const char *id, const StoreConfigCallback &store_cb, const RestoreConfigCallback &load_or_reset_cb, const char *macro_id, const AWT_predefined_config *predef)
#define AWAR_DEFAULT_ALIGNMENT
Definition: aw_awar_defs.hxx:8
void GB_end_transaction_show_error(GBDATA *gbd, GB_ERROR error, void(*error_handler)(GB_ERROR))
Definition: arbdb.cxx:2572
bool is_part() const
Definition: pos_range.h:70
long read_int() const
Definition: AW_awar.cxx:187
static GBDATA * get_next_selected_species()
Definition: ED4_root.cxx:925
AW_awar * set_minmax(float min, float max)
Definition: AW_awar.cxx:532
#define GAP_CHARS
Definition: seq_search.hxx:45
#define FA_AWAR_SHOW_GAPS_MESSAGES
bool showGapsMessages
#define FA_AWAR_BASE_FREQ_T
FA_errorAction
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
void warning(int warning_num, const char *warning_message)
Definition: util.cxx:61
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:718
bool GB_have_error()
Definition: arb_msg.cxx:349
FA_reference
void AW_POPDOWN(AW_window *window)
Definition: AW_window.cxx:52
ExplicitRange recall()
long length() const
Definition: seq_search.hxx:352
GBDATA * GBT_expect_SAI(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:184
char * ARB_strpartdup(const char *start, const char *end)
Definition: arb_string.h:51
int size() const
Definition: pos_range.h:66
Aligner_get_next_selected_species get_next_selected_species
uint32_t GB_checksum(const char *seq, long length, int ignore_case, const char *exclude)
Definition: adstring.cxx:320
void set(char c, char q)
Definition: seq_search.hxx:363
void add(const PosRange &range)
Definition: RangeList.h:61
static PosRange whole()
Definition: pos_range.h:52
void awt_create_SAI_selection_button(GBDATA *gb_main, AW_window *aws, const char *varname, const SaiSelectionlistFilterCallback &fcb)
static void build_reverse_complement(AW_window *aw, const AlignDataAccess *data_access)
#define FA_AWAR_REFERENCE
static CompactedSubSequence * readCompactedSequence(GBDATA *gb_species, const char *ali, ARB_ERROR *errorPtr, char **dataPtr, long *seqChksum, PosRange range)
static int overallSequenceNumber
const char * name() const
Definition: seq_search.hxx:192
#define ARRAY_ELEMS(array)
Definition: arb_defs.h:19
#define FA_AWAR_CONTINUE_ON_ERROR
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
static ARB_ERROR insertClustalValigned(AlignBuffer *alignBuffer, SequencePosition &master, SequencePosition &slave, const char *masterAlignment, const char *slaveAlignment, long alignmentLength, FastAlignReport *report)
long rightOf() const
Definition: seq_search.hxx:280
GB_ERROR GB_push_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2482
FILE * seq
Definition: rns.c:46
ARB_ERROR bufferTooSmall()
#define AWAR_NN_REL_SCALING
double log3(double d)
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1904
#define AWAR_NN_FAST_MODE
long leftOf() const
Definition: seq_search.hxx:279
int follow_ali_change(const AliChange &change)
FamilyFinder * ff
#define GAP_CHAR
const char * read_char_pntr() const
Definition: AW_awar.cxx:171
NOT4PERL GBDATA * GBT_add_data(GBDATA *species, const char *ali_name, const char *key, GB_TYPES type) __ATTR__DEPRECATED_TODO("better use GBT_create_sequence_data()")
Definition: adali.cxx:571
char *(* Aligner_get_consensus_func)(const char *species_name, PosRange range)
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:910
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
WindowCallback makeHelpCallback(const char *helpfile)
Definition: aw_window.hxx:106
#define TEST_EXPECT(cond)
Definition: test_unit.h:1313
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:752
#define AWAR_PT_SERVER
#define SCORE_LEFT()
static GB_alignment_type global_alignmentType
Definition: arbdb.h:78
#define INSERTS_NAME
GB_ERROR deliver() const
Definition: arb_error.h:114
void FastAligner_set_reference_species(AW_root *root)
static void nullcb()
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
NOT4PERL void GBT_reverseComplementNucSequence(char *seq, long length, char T_or_U)
Definition: adRevCompl.cxx:102
static AW_window * create_family_settings_window(AW_root *root)
static ARB_ERROR alignCompactedTo(CompactedSubSequence *toAlignSequence, const FastSearchSequence *alignTo, int max_seq_length, GB_CSTR alignment, long toAlignChksum, GBDATA *gb_toAlign, GBDATA *gb_alignTo, const AlignParams &ali_params)
void restrict_2_region(const PosRange &range_)
AW_awar * awar_float(const char *var_name, float default_value=0.0, AW_default default_file=AW_ROOT_DEFAULT)
Definition: AW_root.cxx:560
#define FA_AWAR_SUBST_PARA_AT
GBDATA * gb_species_data
Definition: adname.cxx:34
static const char * read_name(GBDATA *gbd)
int GB_read_security_write(GBDATA *gbd)
Definition: arbdb.cxx:1570
#define FA_AWAR_PROTECTION
#define TEST_EXPECT_EQUAL__BROKEN(expr, want, got)
Definition: test_unit.h:1284
#define FA_AWAR_ACTION_ON_ERROR
int no_of_gaps_after(int cPos) const
Definition: seq_search.hxx:197
const int * gapsBefore(int offset=0) const
Definition: seq_search.hxx:208
void append(LooseBases &loose)
void FastAligner_start(AW_window *aw, const AlignDataAccess *data_access)
void copy(const char *s, char q, long len)
Definition: seq_search.hxx:357
bool GB_is_ancestor_of(GBDATA *gb_ancestor, GBDATA *gb_descendant)
Definition: arbdb.cxx:1742
ExplicitReference(GB_CSTR alignment_, const FastSearchSequence *targetSequence_, GBDATA *gb_alignTo_, int max_seq_length_, const AlignParams &ali_params_)
void message(char *errortext)
int get_max_seq_length() const
#define TEST_REJECT(cond)
Definition: test_unit.h:1315
FA_report
void(* refresh_display)()
GBDATA *(* Aligner_get_first_selected_species)(int *total_no_of_selected_species)
AliChange(const CompactedSubSequence &old_, const CompactedSubSequence &new_)
int min(int i, int j)
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * GB_get_root(GBDATA *gbd)
Definition: arbdb.cxx:1738
bool uses_rel_matches() const
void insertBase(AlignBuffer *alignBuffer, SequencePosition &master, SequencePosition &slave, FastAlignReport *report)
GBDATA * GBT_next_marked_species(GBDATA *gb_species)
Definition: aditem.cxx:116
#define FA_AWAR_PT_SERVER_ALIGNMENT
iterator begin() const
Definition: RangeList.h:51
#define FA_AWAR_RANGE
#define CAN_SCORE_RIGHT()
ARB_ERROR ClustalV_align(int is_dna, int weighted, const char *seq1, int length1, const char *seq2, int length2, const int *gapsBefore1, int max_seq_length, const char *&res1, const char *&res2, int &reslen, int &score)
Definition: ClustalV.cxx:971
int follow_ali_change_and_append(LooseBases &loose, const AliChange &change)
const char * text() const
Definition: seq_search.hxx:189
iterator end() const
Definition: RangeList.h:52
Definition: align.cxx:35
GB_alignment_type GBT_get_alignment_type(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:752
#define AWAR_SPECIES_NAME
#define YOFF
void correctUnalignedPositions()
Definition: seq_search.cxx:124
void ClustalV_exit()
Definition: ClustalV.cxx:1049
char * getHelixString() const
#define cmp(h1, h2)
Definition: admap.cxx:50
Aligner_get_selected_range get_selected_range
#define FA_AWAR_GAP_C
FF_complement
bool is_gap(char c)
Definition: seq_search.hxx:48
void AWTC_create_common_next_neighbour_vars(AW_root *aw_root, const RootCallback &awar_changed_cb)
GBDATA * GBT_find_species_rel_species_data(GBDATA *gb_species_data, const char *name)
Definition: aditem.cxx:133
FA_turn
static AW_window * create_island_hopping_window(AW_root *root)
char * read_string() const
Definition: AW_awar.cxx:201
static WindowCallback simple(void(*root_cb)(AW_root *, T), T t)
Definition: rootAsWin.h:47
bool ARB_in_novice_mode(AW_root *awr)
Definition: aw_root.hxx:193
AW_awar * awar(const char *awar)
Definition: AW_root.cxx:554
GB_ERROR GB_pop_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2512
#define FA_AWAR_USE_ISLAND_HOPPING
#define FA_AWAR_STRUCTURE_SUPPLEMENT
Definition: arbdb.h:86
int no_of_gaps_before(int cPos) const
Definition: seq_search.hxx:196
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:682
FamilyList * insertSortedBy_matches(FamilyList *other)
FamilyList * next
PosRange range
bool is_ali_gap(char c)
Definition: seq_search.hxx:47
int baseMatch(char c1, char c2)
Definition: ClustalV.cxx:249
static ARB_ERROR alignTo(GBDATA *gb_toAlign, GB_CSTR alignment, const FastSearchSequence *alignTo, GBDATA *gb_alignTo, int max_seq_length, const AlignParams &ali_params)
void awt_create_PTSERVER_selection_button(AW_window *aws, const char *varname)
void count_unaligned_base(int no_of_bases)
Definition: seq_search.hxx:466
long GBT_count_marked_species(GBDATA *gb_main)
Definition: aditem.cxx:353
float read_float() const
Definition: AW_awar.cxx:180
#define TEST_EXPECT_ZERO(cond)
Definition: test_unit.h:1074
GB_alignment_type
Definition: arbdb_base.h:61
void memorize(ExplicitRange range)
GB_ERROR write_as_string(const char *aw_string)
static ARB_ERROR cannot_fast_align(const CompactedSubSequence &master, long moffset, long mlength, const CompactedSubSequence &slaveSequence, long soffset, long slength, int max_seq_length, AlignBuffer *alignBuffer, FastAlignReport *report)
GB_ERROR GB_set_temporary(GBDATA *gbd) __ATTR__USERESULT
Definition: arbdb.cxx:2274
#define AWAR_EDITOR_ALIGNMENT
static void appendNameAndUsedBasePositions(char **toString, GBDATA *gb_species, int usedBasePositions)
int aw_question(const char *unique_id, const char *question, const char *buttons, bool sameSizeButtons, const char *helpfile)
Definition: AW_question.cxx:26
static AWT_config_mapping_def aligner_config_mapping[]
#define FA_AWAR_SAI_RANGE_NAME
AW_awar * awar_int(const char *var_name, long default_value=0, AW_default default_file=AW_ROOT_DEFAULT)
Definition: AW_root.cxx:580
#define TEST_EXPECT_NULL__BROKEN(n, got)
Definition: test_unit.h:1308
double partSignificance(long seq1len, long seq2len, long partlen)
#define FA_AWAR_MIRROR
#define TEST_EXPECT_NULL(n)
Definition: test_unit.h:1307
long offset() const
Definition: seq_search.hxx:413
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2761
SearchRelativesReference(SearchRelativeParams &relSearch_, int max_seq_length_, FA_turn turnAllowed_, GB_CSTR alignment_, const AlignParams &ali_params_)
AW_window * FastAligner_create_window(AW_root *root, const AlignDataAccess *data_access)
ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE
GB_ERROR GBT_write_string(GBDATA *gb_container, const char *fieldpath, const char *content)
Definition: adtools.cxx:451
#define OVERRIDE
Definition: cxxforward.h:93
#define FA_AWAR_SUBST_PARA_AC
#define FA_AWAR_REFERENCE_NAME
const char * text() const
Definition: seq_search.hxx:264
virtual ~AlignmentReference()
long insertsToNextBase(AlignBuffer *alignBuffer, const SequencePosition &master)
ARB_ERROR FastAligner_delete_temp_entries(GBDATA *gb_species, const char *alignment)
#define FA_AWAR_BASE_FREQ_G
static IslandHopping * island_hopper
fa_assert(chars< MESSAGE_BUFFERSIZE)
static ARB_ERROR align_error(ARB_ERROR olderr, GBDATA *gb_toAlign, GBDATA *gb_alignTo)
static ARB_ERROR insertAligned(AlignBuffer *alignBuffer, SequencePosition &master, SequencePosition &slave, long partLength, FastAlignReport *report)
Aligner(GBDATA *gb_main_, FA_alignTarget alignWhat_, GB_CSTR alignment_, GB_CSTR toalign_, Aligner_get_first_selected_species get_first_selected_species_, Aligner_get_next_selected_species get_next_selected_species_, GB_CSTR reference_, Aligner_get_consensus_func get_consensus_, SearchRelativeParams &relSearch_, FA_turn turnAllowed_, const AlignParams &ali_params_, int maxProtection_, bool continue_on_error_, FA_errorAction error_action_)
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:903
FA_range
#define AWAR_CURSOR_POSITION_LOCAL
long offset() const
Definition: seq_search.hxx:354
static ARB_ERROR reverseComplement(GBDATA *gb_species, GB_CSTR ali, int max_protection)
ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE
const FastAlignInsertion * next() const
Definition: seq_search.hxx:412
GBDATA *(* Aligner_get_next_selected_species)(void)
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
static ARB_ERROR writeStringToAlignment(GBDATA *gb_species, GB_CSTR alignment, GB_CSTR data_name, GB_CSTR str, bool temporary)
RangeList build_RangeList_from_string(const char *SAI_data, const char *set_bytes, bool invert)
Definition: RangeList.cxx:32
const FamilyList * getFamilyList() const
#define FA_AWAR_REPORT
int follow(ExplicitRange &range) const
#define TEST_EXPECT_NO_ERROR(call)
Definition: test_unit.h:1107
void aw_message(const char *msg)
Definition: AW_status.cxx:932
void insertSlaveBases(AlignBuffer *alignBuffer, SequencePosition &slave, int length, FastAlignReport *report)
AW_root * get_root()
Definition: aw_window.hxx:348
#define SCORE_RIGHT()
const PosRange & get_TargetRange() const
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:97
GBDATA * GBT_find_species(GBDATA *gb_main, const char *name)
Definition: aditem.cxx:139
Aligner_get_consensus_func get_group_consensus
static GBDATA * get_first_selected_species(int *total_no_of_selected_species)
Definition: ED4_root.cxx:933
int end() const
Definition: pos_range.h:61
AlignmentReference(GB_CSTR alignment_, int max_seq_length_, const AlignParams &ali_params_)
#define FA_AWAR_THRESHOLD
GB_ERROR write_string(const char *aw_string)
#define offset(field)
Definition: GLwDrawA.c:73
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:687
const char * GBT_get_name(GBDATA *gb_item)
Definition: aditem.cxx:450
FA_alignTarget
const CompactedSubSequence & sequence() const
Definition: seq_search.hxx:504
#define AWAR_NN_MISMATCHES
int GB_get_transaction_level(GBDATA *gbd)
Definition: arbdb.cxx:2578
void GBT_mark_all(GBDATA *gb_main, int flag)
Definition: aditem.cxx:295
#define XOFF
static ARB_ERROR alignToNextRelative(SearchRelativeParams &relSearch, int max_seq_length, FA_turn turnAllowed, GB_CSTR alignment, GBDATA *gb_toAlign, const AlignParams &ali_params)
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:898
GBDATA * gb_main
Definition: adname.cxx:33
#define FA_AWAR_SAI_RANGE_CHARS
AW_awar * awar_string(const char *var_name, const char *default_value="", AW_default default_file=AW_ROOT_DEFAULT)
Definition: AW_root.cxx:570
void memorize_insertion(long offset, long gaps)
Definition: seq_search.hxx:445
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:441
FA_report report
size_t length
#define FA_AWAR_USE_SECONDARY
long calcSequenceChecksum(const char *data, long length)
#define FA_AWAR_GAP_B
#define FA_AWAR_NEXT_RELATIVES
static int restart
Definition: arb_a2ps.c:264
const char * GB_CSTR
Definition: arbdb_base.h:25
int compPosition(int xPos) const
Definition: seq_search.hxx:205
#define FA_AWAR_SUBST_PARA_CG
#define AW_ROOT_DEFAULT
Definition: aw_base.hxx:106
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
GB_ERROR write_int(long aw_int)
const AlignParams & get_ali_params() const
ConsensusReference(GB_CSTR alignment_, Aligner_get_consensus_func get_consensus_, int max_seq_length_, const AlignParams &ali_params_)
void aw_message_if(GB_ERROR error)
Definition: aw_msg.hxx:21
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:649
const char * quality() const
Definition: seq_search.hxx:351
NOT4PERL GB_ERROR GBT_determine_T_or_U(GB_alignment_type alignment_type, char *T_or_U, const char *supposed_target)
Definition: adRevCompl.cxx:90
#define FA_AWAR_BASE_FREQ_C
#define AWAR_NN_OLIGO_LEN
void setDotsAtEOSequence()
Definition: seq_search.cxx:197
void copy_corresponding_part(char *dest, const char *source, size_t source_len) const
Definition: pos_range.cxx:15
#define MIN_ALIGNMENT_RANGE
#define AWAR_NN_REL_MATCHES
#define FA_AWAR_SUBST_PARA_GT
GBDATA * GBT_get_species_data(GBDATA *gb_main)
Definition: aditem.cxx:105
GB_write_int const char s
Definition: AW_awar.cxx:156