ARB
group_search.cxx
Go to the documentation of this file.
1 // ============================================================= //
2 // //
3 // File : group_search.cxx //
4 // Purpose : provides group search functionality //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in April 2017 //
7 // http://www.arb-home.de/ //
8 // //
9 // ============================================================= //
10 
11 #include "group_search.h"
12 
13 #include <arb_strarray.h>
14 #include <arb_progress.h>
15 #include <arb_sort.h>
16 #include <arb_strbuf.h>
17 #include <arb_defs.h>
18 
19 #include <gb_aci_impl.h>
20 
21 #include <ad_cb.h>
22 #include <TreeNode.h>
23 
24 #include <map>
25 #include <stack>
26 #include <arb_misc.h>
27 #include <arb_msg_nospam.h>
28 
29 using namespace std;
30 
31 class GroupSearchTree;
32 
33 class GroupSearchRoot FINAL_TYPE : public TreeRoot {
34 public:
37  {}
38  ~GroupSearchRoot() FINAL_OVERRIDE { predelete(); }
39 
40  DEFINE_TREE_ROOT_ACCESSORS(GroupSearchRoot, GroupSearchTree);
41 
42  // TreeRoot interface
43  inline TreeNode *makeNode() const OVERRIDE;
44  inline void destroyNode(TreeNode *node) const OVERRIDE;
45 };
46 
47 class GroupSearchTree FINAL_TYPE : public TreeNode {
48  mutable Lazy<int,-1> size; // number of leafs (=zombies+species); -1 -> need update
49  mutable Lazy<int,-1> marked; // number of marked species; -1 -> need update
50  mutable Lazy<int,-1> zombies; // number of zombies
51  mutable LazyFloat<double> aid; // average ingroup distance
52 
53  enum UpdateWhat {
54  UPDATE_SIZE, // quick (update 'size' only)
55  UPDATE_MARKED, // slow (update all)
56  };
57 
58  void update_info(UpdateWhat what) const;
59  void calc_average_ingroup_distance(int group_size) const;
60  double weighted_branchlength_sum(int group_size) const;
61 
62  static GBDATA *gb_species_data;
63 
64 public:
65  GroupSearchTree(GroupSearchRoot *root) :
66  TreeNode(root)
67  {}
68 
69  DEFINE_TREE_RELATIVES_ACCESSORS(GroupSearchTree);
70 
71  static void set_species_data(GBDATA *gb_species_data_) { gb_species_data = gb_species_data_; }
72 
73  // TreeNode interface
74  unsigned get_leaf_count() const FINAL_OVERRIDE {
75  if (size.needs_eval()) update_info(UPDATE_SIZE);
76  return size;
77  }
78  void compute_tree() OVERRIDE {
79  gs_assert(0); // should be unused
80  }
81 
82  unsigned get_marked_count() const {
83  if (marked.needs_eval()) update_info(UPDATE_MARKED);
84  return marked;
85  }
86  unsigned get_zombie_count() const {
87  if (zombies.needs_eval()) update_info(UPDATE_MARKED);
88  return zombies;
89  }
90 
92  if (aid.needs_eval()) calc_average_ingroup_distance(get_leaf_count());
93  return aid;
94  }
95 };
96 
98 
99 inline TreeNode *GroupSearchRoot::makeNode() const { return new GroupSearchTree(const_cast<GroupSearchRoot*>(this)); }
100 inline void GroupSearchRoot::destroyNode(TreeNode *node) const { delete DOWNCAST(GroupSearchTree*,node); }
101 
102 void GroupSearchTree::update_info(UpdateWhat what) const {
103  if (is_leaf()) {
104  size = 1;
105  if (what == UPDATE_MARKED) {
107 
109  if (gb_species) {
110  marked = GB_read_flag(gb_species);
111  zombies = 0;
112  }
113  else {
114  marked = 0;
115  zombies = 1;
116  }
117  }
118  }
119  else {
120  switch (what) {
121  case UPDATE_MARKED:
122  marked = get_leftson()->get_marked_count() + get_rightson()->get_marked_count(); // triggers lazy-update (UPDATE_MARKED)
123  zombies = get_leftson()->get_zombie_count() + get_rightson()->get_zombie_count();
124  // fall-through
125  case UPDATE_SIZE:
126  size = get_leftson()->get_leaf_count() + get_rightson()->get_leaf_count(); // triggers lazy-update (UPDATE_SIZE)
127  break;
128  }
129  }
130 }
131 
133 
135  string name;
136  RefPtr<GBDATA> gb_tree;
137  long inner_nodes; // number of inner nodes in binary tree (i.e. ROOTED)
138  // (Note: corrupted trees in existing DBs sometimes contain zero nodes
139  // (caused by older bugs?))
140 
141  GroupSearchRootPtr troot; // (optional) loaded tree
142  string load_error;
143 
144  void load_tree() {
145  gs_assert(!tree_is_loaded() && !failed_to_load());
146  troot = new GroupSearchRoot;
147  TreeNode *rootNode = GBT_read_tree(GB_get_root(gb_tree), get_name(), &*troot);
148  gs_assert(implicated(rootNode, !rootNode->is_normal_group())); // otherwise parent caching will get confused
149  if (!rootNode) {
150  load_error = GB_await_error();
151  }
152  else {
153  gs_assert(rootNode == troot->get_root_node());
154  }
155  }
156 
157 public:
158  SearchedTree(const char *name_, GBDATA *gb_main) :
159  name(name_),
160  gb_tree(GBT_find_tree(gb_main, name_)),
161  inner_nodes(-1)
162  {
163  gs_assert(gb_tree);
164  GBDATA *gb_nnodes = GB_entry(gb_tree, "nnodes");
165  if (gb_nnodes) inner_nodes = GB_read_int(gb_nnodes); // see GBT_size_of_tree
166  }
167 
168  GBDATA *get_tree_data() { return gb_tree; }
169  const char *get_name() const { return name.c_str(); }
170 
171  int get_leaf_count() const { return inner_nodes+1; }
172  int get_edge_iteration_count() const { return ARB_edge::iteration_count(get_leaf_count()); }
173 
174  bool tree_is_loaded() const { return troot.isSet(); }
175  bool failed_to_load() const { return !load_error.empty(); }
176  const char *get_load_error() const {
177  gs_assert(failed_to_load());
178  return load_error.c_str();
179  }
180  GroupSearchRoot *get_tree_root() {
181  if (!tree_is_loaded()) load_tree();
182  return failed_to_load() ? NULp : &*troot;
183  }
184  void flush_loaded_tree() { troot.setNull(); }
185 };
186 
187 typedef vector<SearchedTree> SearchedTreeContainer;
188 typedef SearchedTreeContainer::iterator SearchedTreeIter;
189 
190 const char *FoundGroup::get_name() const {
191  GBDATA *gb_name = GB_search(gb_group, "group_name", GB_STRING);
192  return gb_name ? GB_read_char_pntr(gb_name) : NULp;
193 }
194 int FoundGroup::get_name_length() const {
195  GB_transaction ta(gb_group);
196  GBDATA *gb_name = GB_search(gb_group, "group_name", GB_STRING);
197  return GB_read_string_count(gb_name);
198 }
199 
201  return GB_get_father(gb_group);
202 }
203 
204 const char *FoundGroup::get_tree_name() const {
205  GBDATA *gb_tree = get_tree_data();
206  return gb_tree ? GB_read_key_pntr(gb_tree) : NULp;
207 }
208 
210  GBDATA *gb_tree = GB_get_father(gb_group);
211  int order = -1;
212  if (gb_tree) {
213  GBDATA *gb_order = GB_entry(gb_tree, "order");
214  if (gb_order) {
215  order = GB_read_int(gb_order);
216  }
217  }
218  return order;
219 }
220 
222  GB_ERROR error = NULp;
223  GB_transaction ta(gb_group);
224 
225  GBDATA *gb_gname = GB_entry(gb_group, "group_name");
226  gs_assert(gb_gname); // groups shall always have a name
227  if (gb_gname) error = GB_delete(gb_gname);
228 
229  if (!error) {
230  GBDATA *gb_grouped = GB_entry(gb_group, "grouped");
231  if (gb_grouped) error = GB_delete(gb_grouped);
232  }
233 
234  if (!error) {
235  bool keep_node = false;
236  GBQUARK qid = GB_find_existing_quark(gb_group, "id");
237  for (GBDATA *gb_child = GB_child(gb_group); gb_child && !keep_node; gb_child = GB_nextChild(gb_child)) {
238  if (GB_get_quark(gb_child) != qid) {
239  keep_node = true;
240  }
241  }
242  if (!keep_node) { // no child beside "id" left -> delete node
243  error = GB_delete(gb_group.pointer_ref());
244  }
245  }
246 
247  return error;
248 }
249 
250 ARB_ERROR FoundGroup::rename_by_ACI(const char *acisrt, const QueriedGroups& results, int hit_idx) {
252  GB_transaction ta(gb_group);
253 
254  GBDATA *gb_gname = GB_entry(gb_group, "group_name");
255  if (!gb_gname) {
256  gs_assert(0); // groups shall always have a name
257  error = "FATAL: unnamed group detected";
258  }
259  else {
260  char *old_name = GB_read_string(gb_gname);
261  char *new_name = GS_calc_resulting_groupname(gb_group, results, hit_idx, old_name, acisrt, error);
262 
263  if (!error && new_name[0]) { // if ACI produces empty result -> skip rename
264  error = GBT_write_group_name(gb_gname, new_name, true);
265  }
266 
267  free(new_name);
268  free(old_name);
269  }
270 
271  return error;
272 }
273 
274 inline bool group_is_folded(GBDATA *gb_group) {
275  if (!gb_group) return false;
276  GBDATA *gb_grouped = GB_entry(gb_group, "grouped");
277  return gb_grouped && GB_read_byte(gb_grouped) != 0;
278 }
279 inline ARB_ERROR group_set_folded(GBDATA *gb_group, bool folded) {
280  gs_assert(gb_group);
281 
283  GBDATA *gb_grouped = GB_entry(gb_group, "grouped");
284 
285  if (!gb_grouped && folded) {
286  gb_grouped = GB_create(gb_group, "grouped", GB_BYTE);
287  if (!gb_grouped) error = GB_await_error();
288  }
289  if (gb_grouped) {
290  gs_assert(!error);
291  error = GB_write_byte(gb_grouped, folded);
292  }
293 #if defined(ASSERTION_USED)
294  else gs_assert(!folded);
295 #endif
296  return error;
297 }
298 
300  return group_is_folded(get_overlap_group());
301 }
302 bool FoundGroup::is_folded() const {
303  return group_is_folded(gb_group);
304 }
305 
306 ARB_ERROR FoundGroup::set_folded(bool folded) {
307  return group_set_folded(gb_group, folded);
308 }
309 ARB_ERROR FoundGroup::set_overlap_folded(bool folded) {
310  return group_set_folded(get_overlap_group(), folded);
311 }
312 
314  GB_transaction ta(gb_group);
315 
317 
318  bool was_folded = is_folded();
319  bool knows_overlap = knows_details(); // may be false when called by fold_found_groups(); acceptable
320  bool overlap_was_folded = knows_overlap && overlap_is_folded();
321  bool want_folded = was_folded;
322 
323  switch (mode) {
324  case GFM_TOGGLE: want_folded = !(was_folded || overlap_was_folded); break;
325  case GFM_COLLAPSE: want_folded = true; break;
326  case GFM_EXPAND: want_folded = false; break;
327  default: error = "invalid collapse mode"; gs_assert(0); break;
328  }
329 
330  if (!error && want_folded != was_folded) {
331  error = set_folded(want_folded);
332  }
333  if (!error && want_folded != overlap_was_folded && knows_overlap && gb_overlap_group) {
334  error = set_overlap_folded(want_folded);
335  }
336 
337  return error;
338 }
339 
340 void ColumnWidths::track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled) {
341  seen_keeled = seen_keeled || keeled;
342 
343  // track max. width:
344  name = std::max(name, wName);
345  reason = std::max(reason, wReason);
346 
347  // track max. value:
348  max_nesting = std::max(max_nesting, nesting);
349  max_size = std::max(max_size, size);
350  max_marked = std::max(max_marked, marked);
351  max_marked_pc = std::max(max_marked_pc, percent(marked, size));
352  max_cluster_id = std::max(max_cluster_id, clusID);
353  max_aid = std::max(max_aid, int(aid));
354 }
356  gs_assert(knows_details());
357  widths.track(get_name_length(),
358  get_hit_reason().length(),
359  nesting,
360  size,
361  marked,
362  clusterID,
363  aid,
364  keeled);
365 }
366 
367 // ---------------------
368 // ParentCache
369 
370 class ParentCache : virtual Noncopyable {
371  typedef map<GBDATA*,GBDATA*> Cache;
372  Cache cache;
373 
374 public:
375  void defineParentOf(GBDATA *gb_child_group, GBDATA *gb_parent_group) {
376  // gb_parent_group may be NULp
377  gs_assert(gb_child_group);
378  cache[gb_child_group] = gb_parent_group;
379  }
380  GBDATA *lookupParent(GBDATA *gb_child_group) const {
381  Cache::const_iterator found = cache.find(gb_child_group);
382  return found == cache.end() ? NULp : found->second;
383  }
384 
385  void fix_deleted_groups(const GBDATAset& deleted_groups) {
386  ParentCache translate; // translation table: oldDelParent -> newExistingParent (or NULp at top-level)
387  for (GBDATAset::const_iterator del = deleted_groups.begin(); del != deleted_groups.end(); ++del) {
388  GBDATA *gb_remaining_father = lookupParent(*del);
389  if (gb_remaining_father) { // otherwise 'del' point to sth unkown (see comment in GroupSearchCommon)
390  while (gb_remaining_father) {
391  if (deleted_groups.find(gb_remaining_father) == deleted_groups.end()) {
392  break; // not deleted -> use as replacement
393  }
394  gb_remaining_father = lookupParent(gb_remaining_father);
395  }
396  translate.defineParentOf(*del, gb_remaining_father);
397  }
398  }
399 
400  // erase deleted nodes from cache
401  for (GBDATAset::const_iterator del = deleted_groups.begin(); del != deleted_groups.end(); ++del) {
402  cache.erase(*del);
403  }
404 
405  // translate remaining entries
406  for (Cache::iterator c = cache.begin(); c != cache.end(); ++c) {
407  GBDATA *gb_child = c->first;
408  GBDATA *gb_parent = c->second;
409  if (deleted_groups.find(gb_parent) != deleted_groups.end()) {
410  defineParentOf(gb_child, translate.lookupParent(gb_parent));
411  }
412  }
413  }
414 };
415 
416 // ---------------------------
417 // GroupSearchCommon
418 
419 #define TRIGGER_UPDATE_GROUP_RESULTS "/tmp/trigger/group_result_update"
420 
422  // controls and maintains validity of existing group-search-results
423 
424  typedef set<GroupSearch*> GroupSearchSet;
425 
426  GroupSearchSet searches; // all existing searches (normally only one)
427 
428  bool cbs_installed;
429  GBDATA *gb_trigger; // TRIGGER_UPDATE_GROUP_RESULTS (triggers ONCE for multiple DB changes)
430 
431  // The following two sets may also contain "node" entries from
432  // completely different parts of the DB -> do not make assumptions!
433  GBDATAset deleted_groups; // entries are "deleted", i.e. access is invalid! Only comparing pointers is defined!
434  GBDATAset modified_groups;
435 
436  ParentCache pcache;
437 
438  void add_callbacks(GBDATA *gb_main);
439  void remove_callbacks(GBDATA *gb_main);
440 
441  void trigger_group_search_update() { GB_touch(gb_trigger); }
442 
443 public:
445  cbs_installed(false),
446  gb_trigger(NULp)
447  {}
449  gs_assert(!cbs_installed);
450  }
451 
452  ParentCache& get_parent_cache() { return pcache; }
453 
454  void notify_deleted(GBDATA *gb_node) { deleted_groups.insert(gb_node); trigger_group_search_update(); }
455  void notify_modified(GBDATA *gb_node) { modified_groups.insert(gb_node); trigger_group_search_update(); }
456 
457  bool has_been_deleted(GBDATA *gb_node) { return deleted_groups.find(gb_node) != deleted_groups.end(); }
458  bool has_been_modified(GBDATA *gb_node) { return modified_groups.find(gb_node) != modified_groups.end(); }
459 
460  void add(GroupSearch *gs) {
461  if (empty()) {
462  GBDATA *gb_main = gs->get_gb_main();
463  add_callbacks(gb_main);
464  }
465  searches.insert(gs);
466  }
467  void remove(GroupSearch *gs) {
468  searches.erase(gs);
469  if (empty()) {
470  GBDATA *gb_main = gs->get_gb_main();
471  remove_callbacks(gb_main);
472  }
473  }
474  bool empty() const { return searches.empty(); }
475 
477  deleted_groups.clear();
478  modified_groups.clear();
479  }
481  return !(deleted_groups.empty() && modified_groups.empty());
482  }
483 
485  if (has_notifications()) {
486  pcache.fix_deleted_groups(deleted_groups);
487  for (GroupSearchSet::iterator gs = searches.begin(); gs != searches.end(); ++gs) {
488  GroupSearch *gr_search = *gs;
489  gr_search->refresh_results_after_DBchanges();
490  }
491  clear_notifications();
492  }
493  }
494 };
495 
497  bool mark_as_deleted = cbtype == GB_CB_DELETE;
498 
499  if (!mark_as_deleted) {
500  if (!GB_entry(gb_node, "group_name")) { // if group_name disappeared
501  mark_as_deleted = true;
502  }
503  }
504 
505  if (mark_as_deleted) {
506  common->notify_deleted(gb_node);
507  }
508  else {
509  common->notify_modified(gb_node);
510  }
511 }
512 static void group_name_changed_cb(GBDATA *gb_group_name, GroupSearchCommon *common) {
513  GBDATA *gb_node = GB_get_father(gb_group_name);
514  if (gb_node) {
515  common->notify_modified(gb_node);
516  }
517 }
518 static void result_update_cb(GBDATA*, GroupSearchCommon *common) {
519  // is called once after DB changes that might affect validity of group-search-results
520  common->refresh_all_results();
521 }
522 
523 void GroupSearchCommon::add_callbacks(GBDATA *gb_main) {
524  gs_assert(!cbs_installed);
525 
526  GB_transaction ta(gb_main);
527  gb_trigger = GB_search(gb_main, TRIGGER_UPDATE_GROUP_RESULTS, GB_INT);
528 
529  GB_ERROR error = GB_add_hierarchy_callback(gb_main, "node", GB_CB_CHANGED_OR_DELETED, makeDatabaseCallback(tree_node_deleted_cb, this));
530  if (!error) error = GB_add_hierarchy_callback(gb_main, "node/group_name", GB_CB_CHANGED, makeDatabaseCallback(group_name_changed_cb, this));
531  if (!error) error = GB_add_callback(gb_trigger, GB_CB_CHANGED, makeDatabaseCallback(result_update_cb, this));
532 
533  if (error) GBT_message(gb_main, GBS_global_string("Failed to bind callback (Reason: %s)", error));
534  else cbs_installed = true;
535 }
536 
537 void GroupSearchCommon::remove_callbacks(GBDATA *gb_main) {
538  if (cbs_installed) {
539  GB_transaction ta(gb_main);
540  GB_ERROR error = GB_remove_hierarchy_callback(gb_main, "node", GB_CB_CHANGED_OR_DELETED, makeDatabaseCallback(tree_node_deleted_cb, this));
541  if (!error) error = GB_remove_hierarchy_callback(gb_main, "node/group_name", GB_CB_CHANGED, makeDatabaseCallback(group_name_changed_cb, this));
542  GB_remove_callback(gb_trigger, GB_CB_CHANGED, makeDatabaseCallback(result_update_cb, this));
543 
544  if (error) GBT_message(gb_main, GBS_global_string("Failed to remove callback (Reason: %s)", error));
545  else cbs_installed = false;
546  }
547 }
548 
549 // ---------------------
550 // GroupSearch
551 
552 GroupSearchCommon *GroupSearch::common = NULp;
553 
554 GroupSearch::GroupSearch(GBDATA *gb_main_, const GroupSearchCallback& redisplay_results_cb) :
555  gb_main(gb_main_),
556  redisplay_cb(redisplay_results_cb),
557  sortedByOrder(false)
558 {
559  if (!common) common = new GroupSearchCommon;
560  common->add(this);
561 }
562 
564  common->remove(this);
565  if (common->empty()) {
566  delete common;
567  common = NULp;
568  }
569 }
570 
571 static void collect_searched_trees(GBDATA *gb_main, const TreeNameSet& trees_to_search, SearchedTreeContainer& searched_tree) {
572  ConstStrArray tree_names;
573  GBT_get_tree_names(tree_names, gb_main, false);
574 
575  {
576  bool search_all = trees_to_search.empty();
577  for (int t = 0; tree_names[t]; ++t) {
578  if (search_all || trees_to_search.find(tree_names[t]) != trees_to_search.end()) {
579  searched_tree.push_back(SearchedTree(tree_names[t], gb_main));
580  }
581  }
582  }
583 }
584 
585 class Candidate : public FoundGroup {
586  // candidate for a search result
587  // - able to retrieve values (have tree to examine)
589 
590 public:
591  Candidate(const FoundGroup& group_, GroupSearchTree *node_) :
592  FoundGroup(group_),
593  node(node_)
594  {}
595  Candidate(GBDATA *gb_group_, GroupSearchTree *node_) :
596  FoundGroup(gb_group_),
597  node(node_)
598  {}
599 
600  FoundGroup& get_group() { return *this; }
601  const FoundGroup& get_group() const { return *this; }
602 
603  GroupSearchTree *get_clade() { // return node where clade is shown (differs from get_node for keeled groups)
604  TreeNode *keeld = node->keelTarget();
605  return keeld ? DOWNCAST(GroupSearchTree*, keeld) : &*node;
606  }
607  const GroupSearchTree *get_clade() const {
608  return const_cast<Candidate*>(this)->get_clade();
609  }
610 
611  int get_keeledStateInfo() const { return node->keeledStateInfo(); }
612 
613  void inform_group(const GroupSearch& group_search, const string& hitReason) {
614  // retrieve/store all information needed later (e.g. for sorting):
615  hit_reason = hitReason;
616 
617  GroupSearchTree *clade = get_clade();
618 
619  if (nesting.needs_eval()) nesting = group_search.calc_nesting_level(get_pointer());
620  if (size.needs_eval()) size = clade->get_leaf_count();
621  if (marked.needs_eval()) marked = clade->get_marked_count();
622  if (aid.needs_eval()) aid = clade->get_average_ingroup_distance();
623 
624  if (keeled.needs_eval()) {
626 
627  // set info needed for clade-overlap
628  if (keeled) {
629  if (!clade->is_leaf() && clade->is_normal_group()) { // got overlap
630  gb_overlap_group = clade->gb_node;
632  }
633  }
634  else {
635  if (node->is_keeled_group()) { // got overlap
636  gb_overlap_group = node->father->gb_node;
638  }
639  }
640 
641  }
642 
644  }
645 };
646 
647 class TargetGroup: public QueryTarget, virtual Noncopyable {
648  // wrapper to use Candidate as QueryTarget
649  SmartPtr<Candidate> cand;
650 
651 public:
652  TargetGroup(GBDATA *gb_main_, const char *treename_) :
653  QueryTarget(gb_main_, treename_)
654  {}
656 
657  void aimTo(const Candidate& c) { cand = new Candidate(c); }
658  void unAim() { cand.setNull(); }
659 
660  const FoundGroup& get_group() const { gs_assert(cand.isSet()); return cand->get_group(); }
661  const GroupSearchTree *get_clade() const { gs_assert(cand.isSet() && cand->get_clade()); return cand->get_clade(); }
662 
663  const char *get_group_name() const { return get_group().get_name(); }
664  unsigned get_group_size() const { return get_clade()->get_leaf_count(); }
665  unsigned get_marked_count() const { return get_clade()->get_marked_count(); }
666  unsigned get_zombie_count() const { return get_clade()->get_zombie_count(); }
667  double get_average_ingroup_distance() const { return get_clade()->get_average_ingroup_distance(); }
668  int get_keeledStateInfo() const { gs_assert(cand.isSet()); return cand->get_keeledStateInfo(); }
669 
670  // virtual QueryTarget interface:
671  GBDATA *get_ACI_item() const { return get_group().get_pointer(); }
672 };
673 
674 typedef list<Candidate> CandidateList;
675 
676 #if defined(ASSERTION_USED)
677 inline bool isCorrectParent(TreeNode *node, GBDATA *gb_group, GBDATA *gb_parent_group) {
685  gs_assert(node && gb_group);
686 
687  TreeNode *pnode = node->find_parent_with_groupInfo(true);
688  if (pnode) {
689  if (node->gb_node == gb_group) { // = node is not keeled
690  gs_assert(node->is_normal_group());
691  return pnode->gb_node == gb_parent_group;
692  }
693 
694  gs_assert(node->is_keeled_group()); // node is keeled
695  gs_assert(pnode->keelTarget() == node); // pnode is node storing that keeled node
696  gs_assert(pnode->gb_node == gb_group); // groupdata is attached at pnode
697 
698  TreeNode *ppnode = pnode->find_parent_with_groupInfo(true); // continue with next parent
699  if (ppnode) {
700  return ppnode->gb_node == gb_parent_group;
701  }
702  }
703 #if defined(ASSERTION_USED)
704  else {
705  gs_assert(node->gb_node == gb_group);
706  }
707 #endif
708 
709  return gb_parent_group == NULp;
710 }
711 #endif
712 
713 double GroupSearchTree::weighted_branchlength_sum(int group_size) const {
714  int leafs = get_leaf_count();
715  double sum = father ? get_branchlength() * leafs * (group_size-leafs) : 0.0;
716 
717  if (!is_leaf()) {
718  sum += get_leftson()->weighted_branchlength_sum(group_size);
719  sum += get_rightson()->weighted_branchlength_sum(group_size);
720  }
721 
722  return sum;
723 }
724 
725 void GroupSearchTree::calc_average_ingroup_distance(int group_size) const {
726  long pairs = long(group_size)*(group_size-1)/2; // warning: int-overflow with SSURef_NR99_128_SILVA_07_09_16_opt.arb
727 
728  if (pairs) {
729  double wbranchsum = weighted_branchlength_sum(group_size);
730  aid = wbranchsum / pairs;
731 
732  gs_assert(aid>=0);
733  }
734  else {
735  aid = 0;
736  }
737 }
738 
740  typedef set< RefPtr<GBDATA> > ExistingHits;
741 
742  ExistingHits existing_hits;
743  if (mode & GSM_FORGET_EXISTING) forget_results(); // from last search
744  else {
745  for (FoundGroupCIter prev = found->begin(); prev != found->end(); ++prev) {
746  existing_hits.insert(prev->get_pointer());
747  }
748  }
749 
750  bool match_unlisted = mode&GSM_ADD;
751 
752  if (query_expr.isNull()) addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*"); // default
753 
754  if (mode&GSM_MISMATCH) {
755  query_expr->negate();
756  }
757 
758  GB_ERROR error = NULp;
759  {
760  GB_transaction ta(gb_main);
761  SearchedTreeContainer searched_tree;
762 
763  GroupSearchTree::set_species_data(GBT_get_species_data(gb_main));
764 
765  collect_searched_trees(gb_main, trees_to_search, searched_tree);
766 
767  // calc overall iteration count (for progress)
768  long overall_iter_count = 0;
769  for (SearchedTreeIter st = searched_tree.begin(); st != searched_tree.end(); ++st) { // LOOP_VECTORIZED[!<6.0]
770  overall_iter_count += st->get_edge_iteration_count();
771  }
772 
773  // iterate over all trees
774  arb_progress progress("Searching groups", overall_iter_count);
775 
776  bool load_failures = false;
777  for (SearchedTreeIter st = searched_tree.begin(); !error && st != searched_tree.end(); ++st) {
778  GroupSearchRoot *troot = st->get_tree_root();
779 
780  TargetGroup target_group(gb_main, st->get_name());
781 
782  if (!troot) {
783  GBT_message(gb_main, GBS_global_string("Tree skipped: %s", st->get_load_error()));
784  progress.inc_by(st->get_edge_iteration_count());
785  load_failures = true;
786  }
787  else {
788  CandidateList candidate;
789  {
790  // search candidate groups (and populate parent-group cache on-the-fly)
791 
792  GBDATA *gb_parent_group = NULp; // last traversed parent group
793  ParentCache& pcache = common->get_parent_cache();
794  ARB_edge start = rootEdge(troot);
795  ARB_edge e = start;
796 
797  do {
798  switch (e.get_type()) {
799  case ROOT_EDGE:
800  gb_parent_group = NULp;
801  // fall-through
802  case EDGE_TO_LEAF: { // descent (store parents; perform match)
803  TreeNode *node = e.dest();
804  // [Note: order of if-tests is important, when keeled and normal group fall to same location]
805  if (node->is_keeled_group()) {
806  TreeNode *parent = e.source();
807  gs_assert(parent == node->get_father());
808 
809  GBDATA *gb_group = parent->gb_node;
810  pcache.defineParentOf(gb_group, gb_parent_group);
811  gs_assert(isCorrectParent(node, gb_group, gb_parent_group));
812  gb_parent_group = gb_group;
813  }
814  if (!node->is_leaf() && node->has_group_info()) {
815  GBDATA *gb_group = node->gb_node;
816 
817  if (node->is_normal_group()) {
818  pcache.defineParentOf(gb_group, gb_parent_group);
819  gs_assert(isCorrectParent(node, gb_group, gb_parent_group));
820  gb_parent_group = gb_group;
821  }
822 
823  ExistingHits::iterator prev_hit = existing_hits.find(gb_group);
824 
825  bool was_listed = prev_hit != existing_hits.end();
826  bool test_match = !was_listed == match_unlisted;
827 
828  if (test_match) { // store candidates
829  candidate.push_back(Candidate(gb_group, DOWNCAST(GroupSearchTree*, node)));
830  }
831  }
832  break;
833  }
834  case EDGE_TO_ROOT: { // ascent (restore parents)
835  TreeNode *node = e.source();
836  // [Note: order of if-tests is important, when keeled and normal group fall to same location]
837  if (!node->is_leaf() && node->is_normal_group()) {
838  GBDATA *gb_group = node->gb_node;
839  gb_parent_group = pcache.lookupParent(gb_group); // restore parent group
840  gs_assert(isCorrectParent(node, gb_group, gb_parent_group));
841  }
842  if (node->is_keeled_group()) {
843  TreeNode *parent = e.dest();
844  gs_assert(parent == node->get_father());
845 
846  GBDATA *gb_group = parent->gb_node;
847  gb_parent_group = pcache.lookupParent(gb_group); // restore parent group
848  gs_assert(isCorrectParent(node, gb_group, gb_parent_group));
849  }
850  break;
851  }
852  }
853 
854  error = progress.inc_and_error_if_aborted();
855  e = e.next();
856  }
857  while (e != start && !error);
858  }
859 
860  // now run queries for all candidates:
861  bool was_listed = !match_unlisted;
862  for (CandidateList::iterator cand = candidate.begin(); !error && cand != candidate.end(); ++cand) {
863  target_group.aimTo(*cand);
864 
865  string hit_reason;
866  if (query_expr->matches(target_group, hit_reason)) {
867  if (!was_listed) {
868  found->add_candidate(*this, *cand, hit_reason);
869  }
870  }
871  else {
872  if (was_listed) {
873  ExistingHits::iterator prev_hit = existing_hits.find(cand->get_group().get_pointer());
874  gs_assert(prev_hit != existing_hits.end()); // internal logic error
875  existing_hits.erase(prev_hit);
876  }
877  }
878  }
879  target_group.unAim();
880  st->flush_loaded_tree();
881  }
882  }
883 
884  if (load_failures) {
885  // remove failed trees from 'searched_tree'
886  SearchedTreeContainer reduced;
887  for (unsigned t = 0; t<searched_tree.size(); ++t) {
888  if (!searched_tree[t].failed_to_load()) {
889  reduced.push_back(searched_tree[t]);
890  }
891  }
892  int failed_trees = searched_tree.size()-reduced.size();
893  GBT_message(gb_main, GBS_global_string("%i tree(s) failed to load (will operate on rest)", failed_trees));
894  swap(reduced, searched_tree);
895  }
896 
897  if (!match_unlisted && !error) { // keep only hits still listed in existing_hits
898  QueriedGroups *kept = new QueriedGroups;
899 
900  for (FoundGroupCIter prev = found->begin(); prev != found->end(); ++prev) {
901  if (existing_hits.find(prev->get_pointer()) != existing_hits.end()) {
902  kept->add_informed_group(*prev);
903  }
904  }
905  found = kept;
906  }
907  }
908 
909  if (dups.isSet() && !error) {
910  // if elements were kept from last search, they have an outdated clusterID -> reset
911  for (FoundGroupIter g = found->begin(); g != found->end(); ++g) g->forget_cluster_id();
912 
913  error = clusterDuplicates();
914  }
915 
916  if (error) {
917  GBT_message(gb_main, error);
918  found = new QueriedGroups; // clear results
919  }
920 
921  sortedByOrder = false;
922 }
923 
924 // -----------------------------------------
925 // code for dupe-cluster detection
926 
927 inline bool contains(const WordSet& ws, const string& w) { return ws.find(w) != ws.end(); }
928 inline bool contains(const WordSet& ws, const char *w) { string W(w); return contains(ws, W); }
929 
930 static void string2WordSet(const char *name, WordSet& words, const char *wordSeparators, const WordSet& ignored_words) {
931  char *namedup = strdup(name);
932 
933  gs_assert(wordSeparators);
934 
935  ConstStrArray w;
936  GBT_splitNdestroy_string(w, namedup, wordSeparators, SPLIT_DROPEMPTY);
937  for (int i = 0; w[i]; ++i) {
938  if (!contains(ignored_words, w[i])) words.insert(w[i]);
939  }
940 }
// Converts 's' to lowercase in place (character by character, via tolower()).
inline void string_to_lower(std::string& s) {
    for (std::string::iterator c = s.begin(); c != s.end(); ++c) {
        // tolower() is only defined for EOF and values representable as unsigned char.
        // Passing a plain char is undefined behavior for bytes >= 0x80 where char is signed
        // (e.g. group names containing non-ASCII characters) -> convert via unsigned char.
        *c = static_cast<char>(tolower(static_cast<unsigned char>(*c)));
    }
}
946 
947 struct GroupInfo { // helper class for Clusterer::calc_matches
948  string name; // groupname (lowercase if constructed with sens==GB_IGNORE_CASE)
950  SmartPtr<WordSet> words; // single words (if groupname consists of multiple words and 'prep_wordwise' was true)
951 
952  GroupInfo(const FoundGroup& g, bool prep_wordwise, GB_CASE sens, const char *wordSeparators, const WordSet& ignored_words) :
953  name(g.get_name()),
954  tree(g.get_tree_data())
955  {
956  if (sens == GB_IGNORE_CASE) string_to_lower(name);
957 
958  if (prep_wordwise) {
959  words = new WordSet;
960  string2WordSet(name.c_str(), *words, wordSeparators, ignored_words);
961  }
962  }
963 
964  size_t get_word_count() const {
965  // may return zero (if group name only contains ignored words!)
966  return words.isNull() ? 1 : words->size();
967  }
968 };
969 typedef vector<GroupInfo> GroupInfoVec;
970 
973  GB_CASE sens;
974 
975  int min_words; // only used by DNC_WORDWISE
976  WordSet ignored_words; // only used by DNC_WORDWISE
977 
978  string wordSeparators;
979 
980 public:
982  type(exact),
983  sens(sens_),
984  min_words(1)
985  {
986  gs_assert(exact == DNC_WHOLENAME);
987  }
988 
989  DupNameCriterion(DupNameCriterionType wordwise, GB_CASE sens_, int min_words_, const WordSet& ignored_words_, const char *wordSeparators_) :
990  type(wordwise),
991  sens(sens_),
992  min_words(min_words_),
993  wordSeparators(wordSeparators_)
994  {
995  gs_assert(wordwise == DNC_WORDWISE);
996  gs_assert(min_words>0);
997 
998  for (WordSet::const_iterator wi = ignored_words_.begin(); wi != ignored_words_.end(); ++wi) {
999  string word = *wi;
1000  if (sens == GB_IGNORE_CASE) string_to_lower(word);
1001  ignored_words.insert(word);
1002  }
1003  }
1004 
1005  DupNameCriterionType get_name_type() const { return type; }
1006  bool wordwise_name_matching() const { return get_name_type() == DNC_WORDWISE; }
1007 
1008  GB_CASE get_sensitivity() const { return sens; }
1009  const char *get_word_separators() const { return wordSeparators.c_str(); }
1010 
1011  const WordSet& get_ignored_words() const { return ignored_words; }
1012 
1013  int get_min_wanted_words() const { return min_words; }
1014  void set_min_wanted_words(int words) { min_words = words; }
1015 
1016  int name_matches_wordwise(const GroupInfo& gi1, const GroupInfo& gi2) const {
1017  int max_possible_word_matches = min(gi1.get_word_count(), gi2.get_word_count());
1018  if (max_possible_word_matches<min_words) return false;
1019 
1020  if (gi1.words.isNull()) {
1021  if (gi2.words.isNull()) {
1022  gs_assert(min_words<=1);
1023  gs_assert(!contains(ignored_words, gi1.name));
1024  gs_assert(!contains(ignored_words, gi2.name));
1025  return gi1.name.compare(gi2.name) == 0;
1026  }
1027  return name_matches_wordwise(gi2, gi1);
1028  }
1029 
1030  if (gi2.words.isNull()) {
1031  gs_assert(min_words<=1);
1032  gs_assert(!contains(ignored_words, gi2.name));
1033  return contains(*gi1.words, gi2.name);
1034  }
1035 
1036  int matched_words = 0;
1037  for (WordSet::const_iterator wi = gi1.words->begin(); wi != gi1.words->end(); ++wi) {
1038  if (contains(*gi2.words, *wi)) ++matched_words;
1039  }
1040 
1041  return matched_words>=min_words ? matched_words : false;
1042  }
1043 
1044  int name_matches(const GroupInfo& gi1, const GroupInfo& gi2) const {
1045  return type == DNC_WHOLENAME
1046  ? gi1.name.compare(gi2.name) == 0
1047  : name_matches_wordwise(gi1, gi2);
1048  }
1049 };
1050 
1051 typedef set<int> GroupClusterSet;
1052 typedef GroupClusterSet::const_iterator GroupClusterCIter;
1053 
1055  GroupClusterSet members; // contains indices into Clusterer::groups
1056  int num_groups; // size of Clusterer::groups
1057 
1058  mutable vector<uint8_t> lookup; // when non-empty: contains true for members
1059 
1060  inline bool valid(int i) const { return i >= 0 && i<num_groups; }
1061  inline bool have_lookup() const { return !lookup.empty(); }
1062 
1063 public:
1064  GroupCluster(int num_of_groups)
1065  : num_groups(num_of_groups)
1066  {}
1068 
1069  GroupCluster(const GroupCluster& other) : // does NOT copy lookup table
1070  members(other.members),
1071  num_groups(other.num_groups)
1072  {}
1074 
1075  void allow_lookup() const { // create lookup table -> allows to run 'contains()'
1076  if (!have_lookup()) {
1077  lookup.resize(num_groups, int(false));
1078  for (GroupClusterCIter ci = begin(); ci != end(); ++ci) {
1079  lookup[*ci] = true;
1080  }
1081  gs_assert(have_lookup());
1082  }
1083  }
1084  void forget_lookup() const { lookup.clear(); }
1085 
1086  void clear() {
1087  if (have_lookup()) {
1088  for (GroupClusterCIter ci = begin(); ci != end(); ++ci) lookup[*ci] = false;
1089  }
1090  members.clear();
1091  }
1092 
1093  void insert(int i) {
1094  gs_assert(valid(i));
1095  members.insert(i);
1096  if (have_lookup()) lookup[i] = true;
1097  }
1098  void erase(int i) {
1099  gs_assert(valid(i));
1100  members.erase(i);
1101  if (have_lookup()) lookup[i] = false;
1102  }
1103 
1104  bool contains(int i) const {
1105  gs_assert(valid(i));
1106  gs_assert(have_lookup());
1107  return lookup[i];
1108  }
1109 
1110  bool empty() const { return members.empty(); }
1111  size_t size() const { return members.size(); }
1112 
1113  GroupClusterCIter begin() const { return members.begin(); }
1114  GroupClusterCIter end() const { return members.end(); }
1115 };
1116 
1117 
1119  bool listDups; // true->list duplicate groups; false->list "unique" groups (non-duplicate groups)
1120  DupTreeCriterionType ttype;
1121  int minSize; // minimum cluster size (for DLC_DIFF_TREE: minimum number of different trees per cluster)
1122 
1123 public:
1124  DupCriteria(bool listDups_, const DupNameCriterion& nameCrit_, DupTreeCriterionType ttype_, int minSize_) :
1125  DupNameCriterion(nameCrit_),
1126  listDups(listDups_),
1127  ttype(ttype_),
1128  minSize(minSize_)
1129  {
1130  gs_assert(minSize>=2);
1131  }
1132 
1133  DupTreeCriterionType get_tree_type() const { return ttype; }
1134  bool want_unique_groups() const { return !listDups; }
1135 
1136  bool is_inferable() const {
1137  // An inferable criteria has to allow the following deduction:
1138  // (A == B) and (B == C) -> (A == C)
1139  //
1140  // For comparing group names,
1141  // - whole name comparison is an inferable criteria
1142  // - wordwise comparison isnt!
1143 
1144  // Note: comparing trees for equality is inferable,
1145  // comparing trees for difference isnt.
1146 
1147  return !wordwise_name_matching();
1148  }
1149 
1150  bool tree_matches(const GBDATA *data1, const GBDATA *data2) const {
1151  bool did_match;
1152  switch (ttype) {
1153  case DLC_SAME_TREE:
1154  did_match = data1 == data2;
1155  break;
1156 
1157  case DLC_DIFF_TREE:
1158  did_match = data1 != data2;
1159  break;
1160 
1161  case DLC_ANYWHERE:
1162  did_match = true; // ignore tree membership
1163  break;
1164  }
1165  return did_match;
1166  }
1167 
1168  int min_cluster_size() const { return minSize; }
1169  bool big_enough(const GroupCluster& cluster) const { return !cluster.empty() && int(cluster.size())>=minSize; }
1170 };
1171 
1173  // maps matrix indices to linear indices and vv.
1174  //
1175  // For each x/y-pair of matrix indices the following assumptions are made:
1176  // - x!=y (i.e. never used)
1177  // - value(x,y)==value(y,x)
1178 
1179  int size; // matrix size (x and y)
1180  int lin_size;
1181 
1182  int *firstIndexOfRow;
1183  void init_firstIndexOfRow() {
1184  firstIndexOfRow[0] = 0;
1185  for (int y = 1; y<size; ++y) {
1186  firstIndexOfRow[y] = firstIndexOfRow[y-1]+(y-1);
1187  }
1188  }
1189 
1190 public:
1191  SymmetricMatrixMapper(int elements) :
1192  size(elements),
1193  lin_size(size*(size-1)/2),
1194  firstIndexOfRow(new int[size])
1195  {
1196  gs_assert(elements>=2); // smaller is useless
1197  init_firstIndexOfRow();
1198  }
1200  delete [] firstIndexOfRow;
1201  }
1202 
1203  int linear_size() const { return lin_size; }
1204  int linear_index(int x, int y) const {
1205  if (x>y) swap(x, y);
1206 
1207  gs_assert(x<y); // equal indices not allowed
1208  gs_assert(y<size);
1209  gs_assert(x>=0);
1210 
1211  return firstIndexOfRow[y]+x;
1212  }
1213 
1214 #if defined(UNIT_TESTS)
1215  void to_xy(int lin, int& x, int& y) const { // Note: only used in test-code
1216  for (y = 1; y<size && lin>=y; ++y) lin -= y; // if needed in production code: maybe use table for speedup
1217  x = lin;
1218  }
1219 #endif
1220 };
1221 
1222 class Clusterer {
1223  SmartPtr<QueriedGroups> groups;
1224  SmartPtr<DupCriteria> criteria;
1225  SymmetricMatrixMapper symmap;
1226 
1227  vector<uint8_t> name_matches;
1228  vector<bool> tree_matches;
1229 
1230  vector<uint8_t> words; // stores number of words for each group (indices into 'groups'; only valid when wordwise_name_matching)
1231 
1232  int next_id; // used for next cluster
1233  GroupCluster delivered; // stores indices (into 'groups') of all delivered groups
1234 
1235  int pairIdx(int i, int j) const { return symmap.linear_index(i, j); }
1236  void calc_matches(GBDATA *gb_main);
1237 
1238  int fits_into_cluster(int idx, const GroupCluster& cluster, bool strong_fit) const {
1239  const int min_words = criteria->get_min_wanted_words();
1240  bool enough_words = min_words<2 || words[idx] >= min_words;
1241 
1242  gs_assert(min_words>0);
1243 
1244  int fitting = 0;
1245  if (enough_words && !already_delivered(idx) && !cluster.contains(idx)) {
1246  bool fitsAll = true;
1247  bool weakFitAny = true;
1248 
1249  for (GroupClusterCIter ci = cluster.begin(); fitsAll && ci != cluster.end(); ++ci) {
1250  const int pi = pairIdx(idx, *ci);
1251  bool fitWeak = name_matches[pi] >= min_words;
1252 
1253  fitsAll = fitWeak && tree_matches[pi];
1254  weakFitAny = weakFitAny || fitWeak;
1255  }
1256 
1257  if (fitsAll) fitting = idx;
1258  else if (weakFitAny && !strong_fit) fitting = -idx;
1259  }
1260  return fitting;
1261  }
1262 
1263  int find_next_group_fitting_into(const GroupCluster& cluster, int behind_idx, bool strong_fit) const {
1264  // searches for the next group (with an index > 'behind_idx') fitting into 'cluster'.
1265  //
1266  // returns:
1267  // 0 = no such group found
1268  // >0 = index of first fitting group
1269  // <0 = index of candidate group (for cluster extension). not reported if 'strong_fit' is true
1270 
1271  gs_assert(!cluster.empty());
1272  gs_assert(behind_idx>=0);
1273 
1274  const int gcount = groups->size();
1275  int fitting = 0;
1276 
1277  for (int idx = behind_idx+1; idx<gcount && !fitting; ++idx) {
1278  fitting = fits_into_cluster(idx, cluster, strong_fit);
1279  }
1280 
1281  gs_assert(implicated(fitting>0, !cluster.contains(fitting)));
1282  gs_assert(implicated(strong_fit, fitting>=0));
1283 
1284  return fitting;
1285  }
1286 
1287  int find_next_candidate_group_fitting_into(const GroupCluster& cluster, const vector<int>& candidates, int& cand_idx, bool strong_fit) const {
1288  // similar to find_next_group_fitting_into(), but only considers indices listed in 'candidates' (instead of all)
1289  // (they can be retrieved using find_next_group_fitting_into before)
1290  //
1291  // additionally 'cand_idx' is set to the index corresponding with result
1292 
1293  gs_assert(!cluster.empty());
1294  gs_assert(cand_idx>=-1);
1295 
1296  const int cand_size = candidates.size();
1297  int fitting = 0;
1298 
1299  for (int cidx = cand_idx+1; cidx<cand_size; ++cidx) {
1300  int idx = candidates[cidx];
1301 
1302  fitting = fits_into_cluster(idx, cluster, strong_fit);
1303  if (fitting) {
1304  cand_idx = cidx;
1305  break;
1306  }
1307  }
1308 
1309  gs_assert(implicated(fitting>0, !cluster.contains(fitting)));
1310  gs_assert(implicated(strong_fit, fitting>=0));
1311 
1312  return fitting;
1313  }
1314 
1315  void extendClusterToBiggest(GroupCluster& curr, int next_idx, GroupCluster& best, arb_progress& progress_cluster, double done_low, double done_high);
1316 
1317 public:
1319  groups(groups_),
1320  criteria(criteria_),
1321  symmap(groups->size()),
1322  next_id(1),
1323  delivered(groups->size())
1324  {
1325  calc_matches(gb_main);
1326  }
1327 
1328  int max_cluster_start_index() const { return groups->size() - criteria->min_cluster_size(); }
1329 
1330  void buildInferableClusterStartingWith(int start_idx, GroupCluster& cluster);
1331  void findBestClusterBasedOnWords(int wanted_words, GroupCluster& best, arb_progress& progress_cluster, int& first_cluster_found_from_index);
1332 
1333  bool already_delivered(int idx) const { return delivered.contains(idx); }
1334  void deliverCluster(const GroupCluster& ofCluster, QueriedGroups& toResult) {
1335  int this_id = next_id++;
1336  for (GroupClusterCIter ci = ofCluster.begin(); ci != ofCluster.end(); ++ci) {
1337  int idx = *ci;
1338 
1339  // avoid duplication of groups in result list
1341  delivered.insert(idx);
1342 
1343  FoundGroup& g = (*groups)[idx];
1344  g.set_cluster_id(this_id);
1345  toResult.add_informed_group(g);
1346  }
1347  }
1348 
1349  void find_and_deliverTo(QueriedGroups& toResult);
1350  void deliverRest(QueriedGroups& toResult) {
1351  int idx = 0;
1352  for (FoundGroupCIter g = groups->begin(); g != groups->end(); ++g, ++idx) {
1353  if (!already_delivered(idx)) {
1354  toResult.add_informed_group(*g);
1355  }
1356  }
1357  }
1358 
1359  int calc_max_used_words(bool ignore_delivered) {
1360  gs_assert(criteria->wordwise_name_matching()); // otherwise words array contains nothing
1361 
1362  int maxWords = 0;
1363  const int maxidx = groups->size();
1364 
1365  for (int idx = 0; idx<maxidx; ++idx) {
1366  int thisWords = words[idx];
1367 
1368  if (thisWords>maxWords && (ignore_delivered ? !already_delivered(idx) : true)) {
1369  maxWords = thisWords;
1370  }
1371  }
1372 
1373  return maxWords;
1374  }
1375 
1376 };
1377 
1378 void Clusterer::calc_matches(GBDATA *gb_main) {
1379  const int gcount = groups->size();
1380  const int lin_range = symmap.linear_size();
1381  const long way_to_go = long(gcount) + lin_range;
1382 
1383  arb_progress progress(GBS_global_string("[pass 1/2: duplicity matrix (%s)]", GBS_readable_size(lin_range, "b")), way_to_go);
1384 
1385  name_matches.reserve(lin_range);
1386  tree_matches.reserve(lin_range);
1387 
1388  GroupInfoVec info;
1389  info.reserve(gcount);
1390 
1391  { // fetch info to speed up calculation below
1392  GB_transaction ta(gb_main);
1393 
1394  bool prep_wordwise = criteria->wordwise_name_matching();
1395  GB_CASE sens = criteria->get_sensitivity();
1396  const char *wordSeparators = criteria->get_word_separators();
1397  const WordSet& ignoredWords = criteria->get_ignored_words();
1398 
1399  for (FoundGroupCIter g = groups->begin(); g != groups->end() && !progress.aborted(); ++g) {
1400  info.push_back(GroupInfo(*g, prep_wordwise, sens, wordSeparators, ignoredWords));
1401  if (prep_wordwise) {
1402  const GroupInfo& ginfo = info.back();
1403  words.push_back(ginfo.get_word_count());
1404  }
1405  ++progress;
1406  }
1407  }
1408 
1409  for (int i1 = 0; i1<gcount && !progress.aborted(); ++i1) { // calculate pairwise group matches
1410  for (int i2 = i1+1; i2<gcount && !progress.aborted(); ++i2) {
1411  const int li = symmap.linear_index(i1, i2);
1412 
1413  name_matches[li] = criteria->name_matches(info[i1], info[i2]);
1414  tree_matches[li] = criteria->tree_matches(info[i1].tree, info[i2].tree);
1415 
1416  ++progress;
1417  }
1418  }
1419 }
1420 
1421 void Clusterer::buildInferableClusterStartingWith(const int start_idx, GroupCluster& cluster) {
1422  gs_assert(criteria->is_inferable()); // works only for inferable compare criteria
1423 
1424  int gcount = groups->size();
1425  arb_progress progress_build(long(gcount-start_idx-1));
1426 
1427  gs_assert(cluster.empty());
1428  gs_assert(!already_delivered(start_idx));
1429  cluster.insert(start_idx); // always add group at 'start_idx'
1430 
1431  GroupCluster weakCand(gcount); // collects non-strong, possible weak matches
1432 
1433  {
1434  int pcount = start_idx;
1435  int curr_idx = start_idx;
1436  while (!progress_build.aborted()) {
1437  const int addable = find_next_group_fitting_into(cluster, curr_idx, false);
1438  if (!addable) break;
1439 
1440  if (addable>0) { // found a strong match
1441  cluster.insert(addable);
1442  curr_idx = addable;
1443  }
1444  else {
1445  gs_assert(addable<0); // found a weak match
1446  weakCand.insert(-addable);
1447  curr_idx = -addable;
1448  }
1449 
1450  gs_assert(curr_idx>pcount);
1451  progress_build.inc_by(curr_idx-pcount);
1452  pcount = curr_idx;
1453  }
1454  }
1455 
1456  if (criteria->big_enough(cluster) && !progress_build.aborted()) {
1457  // extent cluster (by adding groups that match weak)
1458  // - e.g. add groups from same tree when searching for different trees
1459 
1460  if (!weakCand.empty()) {
1461  GroupCluster toAdd(gcount);
1462 
1463  if (criteria->get_tree_type() == DLC_DIFF_TREE) {
1464  for (GroupClusterCIter w = weakCand.begin(); w != weakCand.end(); ++w) {
1465  int nameFitsAll = true;
1466  for (GroupClusterCIter ci = cluster.begin(); nameFitsAll && ci != cluster.end(); ++ci) {
1467  int pi = pairIdx(*w, *ci);
1468  nameFitsAll = name_matches[pi];
1469  }
1470  if (nameFitsAll) toAdd.insert(*w);
1471  }
1472  }
1473  for (GroupClusterCIter a = toAdd.begin(); a != toAdd.end(); ++a) cluster.insert(*a);
1474  }
1475  }
1476  else { // forget if too small
1477  cluster.clear();
1478  }
1479 
1480  progress_build.done();
1481 
1482  gs_assert(contradicted(cluster.empty(), criteria->big_enough(cluster)));
1483 }
1484 
// Estimate used to scale the progress of Clusterer::extendClusterToBiggest.
// Calculated using int arithmetic (elems*elems/2 - elems); the result is converted to unsigned long.
inline unsigned long permutations(int elems) {
    const int half_square = elems*elems/2;
    return half_square - elems;
}
1488 
1489 void Clusterer::extendClusterToBiggest(GroupCluster& curr, int next_idx, GroupCluster& best, arb_progress& progress_cluster, double done_low, double done_high) {
1490  // extends cluster 'curr' (using all possible combinations starting at 'next_idx' = index into 'groups')
1491  // stores best (=biggest) cluster in 'best'
1492 
1493  vector<int> candidates; // collect all possible groups
1494  {
1495  int idx = next_idx;
1496  while (1) {
1497  const int addable = find_next_group_fitting_into(curr, idx, true);
1498  if (!addable) break;
1499 
1500  candidates.push_back(addable);
1501  idx = addable;
1502  }
1503  }
1504 
1505  if ((candidates.size()+curr.size()) > best.size()) { // any chance to find bigger cluster?
1506  stack<int> previous; // previously added indices (into candidates)
1507  int curr_idx = -1; // last added (i.e. start with candidates[0])
1508 
1509  const int del_size = delivered.size();
1510  const unsigned long permutation_count = permutations(candidates.size());
1511 
1512  while (!progress_cluster.aborted()) {
1513  int addable = find_next_candidate_group_fitting_into(curr, candidates, curr_idx, true);
1514  gs_assert(addable>=0);
1515  if (addable) {
1516  curr.insert(addable);
1517  previous.push(curr_idx);
1518  }
1519  else {
1520  if (curr.size() > best.size() && criteria->big_enough(curr)) { // store 'curr' cluster if better
1521  best = curr;
1522 
1523  const unsigned long permutations_left = permutations(candidates.size()-best.size());
1524  const double done_percent = (permutation_count-permutations_left) / double(permutation_count);
1525  const double overall_done_percent = done_low + (done_high-done_low)*done_percent;
1526 
1527  progress_cluster.inc_to_avoid_overflow(del_size + best.size() * overall_done_percent); // @@@ calculation seems to be wrong (overflows)
1528  }
1529  if (previous.empty()) break; // end iteration
1530 
1531  const int last_cidx = previous.top();
1532  const int last_add = candidates[last_cidx];
1533 
1534  curr.erase(last_add);
1535  previous.pop();
1536  curr_idx = last_cidx;
1537 
1538  const int rest_cand = candidates.size() - (curr_idx+1);
1539  const size_t poss_size = rest_cand + curr.size();
1540  if (poss_size<best.size()) break; // end iteration (impossible to collect enough groups to form a bigger cluster)
1541  }
1542  }
1543 
1544  progress_cluster.inc_to_avoid_overflow(del_size + best.size() * done_high); // @@@ calculation seems to be wrong (overflows)
1545  }
1546 }
1547 
1548 void Clusterer::findBestClusterBasedOnWords(int wanted_words, GroupCluster& best, arb_progress& progress_cluster, int& first_cluster_found_from_index) {
1549  gs_assert(!criteria->is_inferable()); // thorough search not required
1550  gs_assert(best.empty());
1551 
1552  {
1553  const int old_min_words = criteria->get_min_wanted_words();
1554  criteria->set_min_wanted_words(wanted_words);
1555 
1556  const int gcount = groups->size();
1557  const int max_start_idx = gcount - criteria->min_cluster_size();
1558 
1559  GroupCluster curr(gcount);
1560  curr.allow_lookup();
1561 
1562  const int extension_count = 1+(wanted_words-1-old_min_words);
1563  const double done_per_extension = 1.0/extension_count;
1564 
1565  int first_index = 0;
1566 
1567  for (int start_idx = first_cluster_found_from_index; start_idx<max_start_idx && !progress_cluster.aborted(); ++start_idx) {
1568  if (words[start_idx]>=wanted_words && !already_delivered(start_idx)) {
1569  curr.clear();
1570  curr.insert(start_idx);
1571 
1572  extendClusterToBiggest(curr, start_idx, best, progress_cluster, 0.0, done_per_extension);
1573  if (!first_index && !best.empty()) {
1574  first_cluster_found_from_index = first_index = start_idx;
1575  }
1576  }
1577  }
1578 
1579  if (wanted_words>old_min_words && !best.empty() && !progress_cluster.aborted()) { // may less words be accepted?
1580  // extend cluster with "weaker" matches:
1581 
1582  int ext_done = 1;
1583  for (int fewer_words = wanted_words-1; fewer_words>=old_min_words && !progress_cluster.aborted(); --fewer_words, ++ext_done) {
1584  criteria->set_min_wanted_words(fewer_words);
1585 
1586  curr = best;
1587  curr.allow_lookup();
1588 
1589  const double done_start = ext_done*done_per_extension;
1590  extendClusterToBiggest(curr, 0, best, progress_cluster, done_start, done_start+done_per_extension);
1591  }
1592  }
1593 
1594  criteria->set_min_wanted_words(old_min_words);
1595  }
1596 
1597  gs_assert(contradicted(best.empty(), criteria->big_enough(best)));
1598 }
1599 
1600 
1602  int gcount = groups->size();
1603  GroupCluster curr(gcount);
1604 
1605  delivered.allow_lookup();
1606  curr.allow_lookup();
1607 
1608  if (criteria->is_inferable()) { // possible to use "fast" clustering?
1609  const int max_i = max_cluster_start_index();
1610  gs_assert(max_i>0);
1611 
1612  arb_progress progress_cluster("[pass 2/2: fast duplicate search]", long(max_i));
1613  for (int i = 0; i<max_i && !progress_cluster.aborted(); ++i) {
1614  if (!already_delivered(i)) {
1615  curr.clear();
1617  if (!curr.empty()) { // found a cluster
1618  deliverCluster(curr, toResult);
1619  }
1620  }
1621  ++progress_cluster;
1622  }
1623  }
1624  else { // use thorough cluster search
1625  int max_words = calc_max_used_words(true);
1626  const int min_words = criteria->get_min_wanted_words();
1627 
1628  long groups_with_min_words = 0;
1629  for (int gidx = 0; gidx<gcount; ++gidx) { // LOOP_VECTORIZED [!<5.0]
1630  if (words[gidx]>=min_words) ++groups_with_min_words;
1631  }
1632 
1633  arb_progress progress_cluster("[pass 2/2: thorough duplicate search]", groups_with_min_words);
1634 
1635  int first_cluster_found_from_index = 0;
1636  while (max_words >= min_words && !progress_cluster.aborted()) {
1637  curr.clear();
1638  findBestClusterBasedOnWords(max_words, curr, progress_cluster, first_cluster_found_from_index);
1639 
1640  if (curr.empty()) {
1641  --max_words;
1642  first_cluster_found_from_index = 0;
1643  }
1644  else {
1645  deliverCluster(curr, toResult);
1646  progress_cluster.inc_to(delivered.size());
1647  }
1648  }
1649  progress_cluster.done();
1650  }
1651 }
1652 
1653 GB_ERROR GroupSearch::clusterDuplicates() {
1654  GB_ERROR error = NULp;
1655  bool enough_hits = found->size()>=2;
1656 
1657  if (enough_hits) {
1658  arb_progress progress("Restricting to duplicate groups", 2L);
1659  Clusterer clusterer(gb_main, found, dups);
1660 
1661  if (clusterer.max_cluster_start_index()<0) {
1662  enough_hits = false; // e.g. 2 hits, but min. cluster-size==3
1663  progress.done();
1664  }
1665  else {
1666  found = new QueriedGroups; // clear result list
1667  clusterer.find_and_deliverTo(*found); // detect clusters of duplicates and add them to the result list
1668 
1669  if (dups->want_unique_groups() && !progress.aborted()) {
1670  QueriedGroups *nonDupGroups = new QueriedGroups;
1671 
1672  clusterer.deliverRest(*nonDupGroups);
1673  found = nonDupGroups;
1674  }
1675  }
1676 
1677  if (!error) error = progress.error_if_aborted();
1678  }
1679 
1680  if (!enough_hits && !error) {
1681  error = GBS_global_string("Not enough hits (%zu) to find duplicates", found->size());
1682  }
1683 
1684  return error;
1685 }
1686 
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Returns the result list:
// - creates an empty list if no result list exists yet
// - sorts the list first if it is not sorted by the current order
1688  if (found.isNull()) found = new QueriedGroups;
1689  if (!sortedByOrder) sort_results();
1690  return *found;
1691 }
1692 
1695  has_been_deleted(GroupSearchCommon *common_) : common(common_) {}
1696  bool operator()(const FoundGroup& g) { return common->has_been_deleted(g.get_pointer()); }
1697 };
1700  was_modified(GroupSearchCommon *common_) : common(common_) {}
1701  bool operator()(const FoundGroup& g) { return common->has_been_modified(g.get_pointer()); }
1702 };
1703 
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Removes all groups from 'found' for which the predicate has_been_deleted(common) is true.
// Returns true if at least one group has been removed.
1705  FoundGroupIter first_removed = remove_if(found.begin(), found.end(), has_been_deleted(common));
1706  bool erased = first_removed != found.end();
1707 
// remove_if only moves the kept groups to the front -> cut off the rest
1708  found.erase(first_removed, found.end());
// called unconditionally (i.e. also if nothing has been erased)
1709  invalidate_widths();
1710  return erased;
1711 }
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Returns true if 'found' contains at least one group for which
// the predicate was_modified(common) is true.
1713  FoundGroupCIter modified = find_if(found.begin(), found.end(), was_modified(common));
1714  return modified != found.end();
1715 }
1716 
1719  compare_by_criteria(const SortCriteria& by_) : by(by_) {}
1720  bool operator()(const FoundGroup& g1, const FoundGroup& g2) const {
1721  int cmp = 0;
1722  bool last_was_modifier = false;
1723  bool reversed = false;
1724 
1725  SortCriteria::const_iterator crit = by.begin();
1726  while ((!cmp || last_was_modifier) && crit != by.end()) {
1727  last_was_modifier = (*crit == GSC_REVERSE);
1728  switch (*crit) {
1729  case GSC_NONE: gs_assert(0); break; // should not occur
1730  case GSC_REVERSE: reversed = !reversed; break;
1731 
1732  // alphabetically:
1733  case GSC_NAME: cmp = strcmp(g1.get_name(), g2.get_name()); break;
1734  case GSC_TREENAME: cmp = strcmp(g1.get_tree_name(), g2.get_tree_name()); break;
1735 
1736  case GSC_HIT_REASON: cmp = g1.get_hit_reason().compare(g2.get_hit_reason()); break;
1737 
1738  // small first:
1739  case GSC_TREEORDER: cmp = g1.get_tree_order() - g2.get_tree_order(); break;
1740  case GSC_NESTING: cmp = g1.get_nesting() - g2.get_nesting(); break;
1741  case GSC_CLUSTER: cmp = g1.get_cluster_id() - g2.get_cluster_id(); break;
1742  case GSC_AID: cmp = double_cmp(g1.get_aid(), g2.get_aid()); break;
1743 
1744  // big first:
1745  case GSC_SIZE: cmp = g2.get_size() - g1.get_size(); break;
1746  case GSC_MARKED: cmp = g2.get_marked() - g1.get_marked(); break;
1747  case GSC_MARKED_PC: cmp = g2.get_marked_pc() - g1.get_marked_pc(); break;
1748  case GSC_KEELED: cmp = g2.get_keeled() - g1.get_keeled(); break;
1749  }
1750  ++crit;
1751  }
1752  return reversed ? cmp>0 : cmp<0;
1753  }
1754 };
1755 
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Sorts 'found' according to the criteria 'by' (stable, i.e. equal groups keep their relative order).
1757  stable_sort(found.begin(), found.end(), compare_by_criteria(by));
// only the address of 'by' is stored -> 'by' has to stay alive while 'sorted_by' is used
1758  sorted_by = &by;
1759 }
1760 
1761 void QueriedGroups::remove_hit(size_t idx) {
1762  if (idx<size()) {
1763  FoundGroupContainer::iterator del = found.begin();
1764  advance(del, idx);
1765  found.erase(del);
1766  invalidate_widths();
1767  }
1768 }
1769 
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Returns the column widths needed to display all groups.
// The widths are calculated on demand (i.e. only if 'widths' is not set).
1771  if (widths.isNull()) {
1772  widths = new ColumnWidths;
1773  ColumnWidths& w = *widths;
// let each group report its width requirements
1774  for (FoundGroupCIter g = begin(); g != end(); ++g) {
1775  g->track_max_widths(w);
1776  }
1777  }
1778  return *widths;
1779 }
1780 const char *QueriedGroups::get_group_display(const FoundGroup& g, bool show_tree_name) const {
1781  const ColumnWidths& width = get_column_widths(); // updates width information (if outdated)
1782 
1783  static GBS_strstruct display;
1784 
1785  display.erase();
1786 
1787  if (width.seen_keeled) display.put(g.get_keeled() ? KEELED_INDICATOR : ' ');
1788  display.nprintf(width.name+1, "%-*s", width.name, g.get_name()); // insert name as 1st column
1789 
1790  if (sorted_by) {
1791  // generate display-string depending on active SortCriteria:
1792  for (SortCriteria::const_iterator sc = sorted_by->begin(); sc != sorted_by->end(); ++sc) {
1793  switch (*sc) {
1794  case GSC_NONE: gs_assert(0); break; // invalid
1795 
1796  case GSC_TREENAME: // ignored (either already shown or only have one tree)
1797  case GSC_TREEORDER: // dito
1798  case GSC_REVERSE:
1799  case GSC_NAME:
1800  break; // ignored for display
1801 
1802  case GSC_HIT_REASON:
1803  display.nprintf(width.reason+1, " %-*s", width.reason, g.get_hit_reason().c_str());
1804  break;
1805 
1806  case GSC_NESTING: {
1807  int nesting_width = ColumnWidths::max2width(width.max_nesting);
1808  display.nprintf(nesting_width+1, " %*i", nesting_width, g.get_nesting());
1809  break;
1810  }
1811  case GSC_SIZE: {
1812  int size_width = ColumnWidths::max2width(width.max_size);
1813  display.nprintf(size_width+1, " %*i", size_width, g.get_size());
1814  break;
1815  }
1816  case GSC_MARKED: {
1817  int marked_width = ColumnWidths::max2width(width.max_marked);
1818  display.nprintf(marked_width+1, " %*i", marked_width, g.get_marked());
1819  break;
1820  }
1821  case GSC_MARKED_PC: {
1822  int marked_width = ColumnWidths::max2width(width.max_marked_pc);
1823  display.nprintf(marked_width+2, " %*i%%", marked_width, g.get_marked_pc());
1824  break;
1825  }
1826  case GSC_CLUSTER: {
1827  int cluster_width = ColumnWidths::max2width(width.max_cluster_id);
1828  display.nprintf(cluster_width+2, " %*ic", cluster_width, g.get_cluster_id());
1829  break;
1830  }
1831  case GSC_AID: {
1832  int aid_width = ColumnWidths::max2width(width.max_aid);
1833  display.nprintf(aid_width+6, " %*.4f", aid_width, g.get_aid());
1834  break;
1835  }
1836  case GSC_KEELED: {
1837  display.nprintf(2, " %i", g.get_keeled());
1838  break;
1839  }
1840  }
1841  }
1842  }
1843 
1844  if (show_tree_name) {
1845  display.put(' ');
1846  display.cat(g.get_tree_name());
1847  }
1848 
1849  return display.get_data();
1850 }
1851 
1852 void QueriedGroups::add_candidate(const GroupSearch& group_search, Candidate& cand, const std::string& hit_reason) {
1853  cand.inform_group(group_search, hit_reason);
1854  add_informed_group(cand.get_group());
1855 }
1856 
1857 
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Updates the result list (if there is a non-empty one):
// groups reported as deleted are removed from the list.
// A redisplay is triggered if groups were removed or if any listed group was changed.
1859  if (!found.isNull() && !found->empty()) {
1860  bool erased = found->erase_deleted(common);
1861  bool changed = false;
// testing for changed groups is skipped if a redisplay is needed anyway
1862  if (!erased) {
1863  changed = found->contains_changed(common);
1864  }
1865  if (erased || changed) {
1866  redisplay_cb(this);
1867  }
1868  }
1869 }
1870 
// NOTE(review): the lines carrying the head of this function (and the statement of the
// GSC_NONE branch) are not part of this listing.
//
// Adds sort criterion 'gsc' at the front of 'order':
// - nothing is added if 'gsc' already is the first criterion
//   (exception: a second GSC_REVERSE removes the first one, i.e. both cancel each other)
// - an earlier occurrence of 'gsc' is removed from 'order'
//   (together with the GSC_REVERSE entries directly in front of it)
1876  if (gsc == GSC_NONE) {
1878  }
1879  else {
1880  bool add = true;
1881 
1882  if (!order.empty() && order.front() == gsc) {
1883  add = false;
1884  if (gsc == GSC_REVERSE) {
1885  order.pop_front(); // eliminate duplicate reverse
1886  sortedByOrder = false;
1887  }
1888  }
1889 
1890  if (add) {
1891  if (gsc != GSC_REVERSE) {
1892  // remove duplicated search criterion from order
1893  SortCriteria::iterator dup = find(order.begin(), order.end(), gsc);
1894  if (dup != order.end()) {
1895  SortCriteria::iterator pre = dup;
// NOTE(review): if all entries in front of 'dup' are GSC_REVERSE, 'pre' gets decremented at
// order.begin(). The loop expects to reach order.end() in that case - the C++ standard does
// not guarantee that; confirm for the container type used for SortCriteria.
1896  do --pre; while (pre != order.end() && *pre == GSC_REVERSE);
1897 
1898  if (pre == order.end()) pre = order.begin(); // erase from start
1899  else ++pre; // step back to 1st GSC_REVERSE
1900 
1901  ++dup; // point behind duplicate
1902  order.erase(pre,dup);
1903  }
1904  }
1905 
1906  order.push_front(gsc);
1907  sortedByOrder = false;
1908  }
1909  }
1910 }
1911 
1912 void GroupSearch::sort_results() {
1913  if (!order.empty()) {
1914  GB_transaction ta(gb_main);
1915  found->sort_by(order);
1916  sortedByOrder = true;
1917  }
1918 }
1919 
1920 void GroupSearch::setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size) {
1921  gs_assert(ntype != DNC_WORDWISE); // use flavor below
1922  dups = new DupCriteria(listDups, DupNameCriterion(ntype, sens), ttype, min_cluster_size);
1923 }
1924 void GroupSearch::setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const WordSet& ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size) {
1925  gs_assert(ntype == DNC_WORDWISE); // use flavor above
1926  dups = new DupCriteria(listDups, DupNameCriterion(ntype, sens, min_words, ignored_words, wordSeparators), ttype, min_cluster_size);
1927 }
1928 void GroupSearch::setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const char *ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size) {
1929  WordSet ignoredWordsSet;
1930  WordSet none; // no words ignored in ignoredWordsSet
1931  string2WordSet(ignored_words, ignoredWordsSet, wordSeparators, none);
1932  setDupCriteria(listDups, ntype, sens, min_words, ignoredWordsSet, wordSeparators, ttype, min_cluster_size);
1933 }
1934 
1935 
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Drops the duplicate criteria ('dups' is set to null).
1937  dups.setNull();
1938 }
1939 
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Deletes the group at position 'idx' of the result list from the database.
// Returns the result of FoundGroup::delete_from_DB or an error message if 'idx' is out of range.
1941  if (idx<found->size()) return (*found)[idx].delete_from_DB();
1942  return "index out-of-bounds";
1943 }
1944 
// NOTE(review): the line carrying the head of this function is not part of this listing.
//
// Deletes all groups of the result list from the database (inside one transaction).
// Stops at the first error; the error is passed to the transaction when closing it.
1946  GB_ERROR error = NULp; // @@@ use ARB_ERROR instead (whole module + callers)
1947  if (has_results()) {
1948  GB_transaction ta(gb_main);
1949 
1950  for (FoundGroupIter group = found->begin(); !error && group != found->end(); ++group) {
1951  error = group->delete_from_DB();
1952  }
// the result of ta.close() becomes the result of this function
1953  error = ta.close(error);
1954  }
1955  return error;
1956 }
1957 
1958 // ------------------------------------------
1959 // ACI extension for group renaming
1960 
1961 using namespace GBL_IMPL;
1962 
1965  int hit_idx;
1966 
1967  GroupRename_callenv(const QueriedGroups& queried_, int hit_idx_, const GBL_env& env_) :
1968  GBL_call_env(NULp, env_),
1969  queried(queried_),
1970  hit_idx(hit_idx_)
1971  {}
1972 
1973  bool legal_hit_index() const { return hit_idx>=0 && unsigned(hit_idx)<queried.size(); }
1974 
1975  const FoundGroup *get_hit_group() const {
1976  if (legal_hit_index()) return &queried[hit_idx];
1977  return NULp;
1978  }
1979 
1980  int get_dupidx(GB_ERROR& error) const {
1981  const FoundGroup *group = get_hit_group();
1982  if (!group) {
1983  error = "no hit";
1984  return -1;
1985  }
1986 
1987  int cluster = group->get_cluster_id();
1988  if (cluster == 0) {
1989  error = "no duplicate";
1990  return -1;
1991  }
1992 
1993  int dupidx = 0;
1994 
1995  for (FoundGroupCIter g = queried.begin(); g != queried.end(); ++g) {
1996  if (&*g == group) return dupidx;
1997  if (g->get_cluster_id() == cluster) dupidx++;
1998  }
1999 
2000  gs_assert(0); // something went wrong
2001  error = "unknown error";
2002  return -1;
2003  }
2004 
2005 };
2006 
2008  return DOWNCAST_REFERENCE(const GroupRename_callenv, args->get_callEnv());
2009 }
2010 
2013  GB_ERROR error = check_no_parameter(args);
2014  if (!error) {
2015  const GroupRename_callenv& callEnv = custom_gr_env(args);
2016  if (callEnv.legal_hit_index()) {
2017  FORMAT_2_OUT(args, "%i", info2bio(callEnv.hit_idx));
2018  }
2019  else {
2020  error = "no hit";
2021  }
2022  }
2023 
2024  return error;
2025 }
2026 
2029  GB_ERROR error = check_no_parameter(args);
2030  if (!error) {
2031  const GroupRename_callenv& callEnv = custom_gr_env(args);
2032 
2033  const int dupidx = callEnv.get_dupidx(error);
2034  if (!error) {
2035  gs_assert(dupidx>=0);
2036  FORMAT_2_OUT(args, "%i", info2bio(dupidx));
2037  }
2038  }
2039 
2040  return error;
2041 }
2042 
2043 
2046  GB_ERROR error = check_no_parameter(args);
2047  if (!error) {
2048  const GroupRename_callenv& callEnv = custom_gr_env(args);
2049  FORMAT_2_OUT(args, "%zu", callEnv.queried.size());
2050  }
2051  return error;
2052 }
2055  GB_ERROR error = check_no_parameter(args);
2056  if (!error) {
2057  const FoundGroup *hit = custom_gr_env(args).get_hit_group();
2058  if (hit) {
2059  FORMAT_2_OUT(args, "%i", hit->get_size());
2060  }
2061  else {
2062  error = "no hit";
2063  }
2064  }
2065  return error;
2066 }
2069  GB_ERROR error = check_no_parameter(args);
2070  if (!error) {
2071  const FoundGroup *hit = custom_gr_env(args).get_hit_group();
2072  if (hit) {
2073  FORMAT_2_OUT(args, "%i", hit->get_marked());
2074  }
2075  else {
2076  error = "no hit";
2077  }
2078  }
2079  return error;
2080 }
2083  GB_ERROR error = check_no_parameter(args);
2084  if (!error) {
2085  const FoundGroup *hit = custom_gr_env(args).get_hit_group();
2086  if (hit) {
2087  FORMAT_2_OUT(args, "%f", hit->get_aid());
2088  }
2089  else {
2090  error = "no hit";
2091  }
2092  }
2093  return error;
2094 }
2097  GB_ERROR error = check_no_parameter(args);
2098  if (!error) {
2099  const FoundGroup *hit = custom_gr_env(args).get_hit_group();
2100  if (hit) {
2101  FORMAT_2_OUT(args, "%i", hit->get_nesting());
2102  }
2103  else {
2104  error = "no hit";
2105  }
2106  }
2107  return error;
2108 }
2109 
2110 
2112  { "hitidx", grl_hitidx },
2113  { "dupidx", grl_dupidx },
2114  { "hitcount", grl_hitcount },
2115  { "groupSize", grl_groupsize },
2116  { "markedInGroup", grl_markedingroup },
2117  { "aid", grl_aid },
2118  { "nesting", grl_nesting },
2119 
2120  { NULp, NULp }
2121 };
2122 
2124  static GBL_custom_command_lookup_table clt(groupRename_command_table,
2125  ARRAY_ELEMS(groupRename_command_table)-1,
2127  return clt;
2128 }
2129 
2130 char *GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups& queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR& error) {
2131  char *result = NULp;
2132  if (!input_name || !input_name[0]) {
2133  error = "Error: empty input groupname";
2134  }
2135  else {
2136  GB_transaction ta(gb_main);
2137  bool know_hit = hit_idx>=0 && unsigned(hit_idx)<queried.size();
2138  const FoundGroup *hit = know_hit ? &queried[hit_idx] : NULp;
2139 
2140  GBL_env env(gb_main, hit ? hit->get_tree_name() : NULp, get_GroupRename_customized_ACI_commands());
2141  GroupRename_callenv callEnv(queried, hit_idx, env);
2142 
2143  result = GB_command_interpreter_in_env(input_name, acisrt, callEnv);
2144  if (!result) {
2145  error = GBS_global_string("Error: %s", GB_await_error());
2146  }
2147  else {
2148  freeset(result, GBS_trim(result)); // trim whitespace
2149  }
2150  }
2151  return result;
2152 }
2153 
2154 ARB_ERROR GroupSearch::rename_group(size_t idx, const char *acisrt) {
2155  if (idx<found->size()) {
2156  return (*found)[idx].rename_by_ACI(acisrt, *found, idx);
2157  }
2158  return "index out-of-bounds";
2159 }
2160 
2162  ARB_ERROR error;
2163  if (has_results()) {
2164  GB_transaction ta(gb_main);
2165 
2166  MessageSpamFilter suppress("problematic group names");
2167 
2168  int idx = 0;
2169  for (FoundGroupIter group = found->begin(); !error && group != found->end(); ++group, ++idx) {
2170  error = group->rename_by_ACI(acisrt, *found, idx);
2171  }
2172  error = ta.close(error);
2173  }
2174  return error;
2175 }
2176 
2178  if (idx<found->size()) {
2179  return (*found)[idx].change_folding(mode);
2180  }
2181  return "index out-of-bounds";
2182 }
2183 
2185  // works for groups which are members of one of the searched tree
2186  return common->get_parent_cache().lookupParent(gb_group);
2187 }
2188 
2190  int nesting = 0;
2191  while (gb_group) {
2192  gb_group = get_parent_group(gb_group);
2193  if (gb_group) ++nesting;
2194  }
2195  return nesting;
2196 }
2197 
2198 
2200  ARB_ERROR error;
2201  GB_transaction ta(gb_main);
2202 
2203  GBDATAset modifiedTrees;
2204 
2205  // create a set of affected groups
2206  GBDATAset targetGroups;
2207  for (FoundGroupCIter g = found->begin(); g != found->end(); ++g) {
2208  GBDATA *gb_group = g->get_pointer();
2209  targetGroups.insert(gb_group);
2210  }
2211 
2212  if (mode & GFM_RECURSE) { // also operate on parents
2213  GBDATAset testParentsOf = targetGroups;
2214  if (mode & GFM_PARENTS_ONLY) targetGroups.clear();
2215  while (!testParentsOf.empty()) { // redo until no more parents get added
2216  GBDATAset addedParents;
2217  for (GBDATAset::iterator t = testParentsOf.begin(); t != testParentsOf.end(); ++t) {
2218  GBDATA *gb_parent_group = get_parent_group(*t);
2219  if (gb_parent_group && targetGroups.find(gb_parent_group) == targetGroups.end()) {
2220  addedParents.insert(gb_parent_group);
2221  targetGroups.insert(gb_parent_group);
2222  }
2223  }
2224  testParentsOf = addedParents;
2225  }
2226  }
2227 
2229  for (GBDATAset::iterator n = targetGroups.begin(); n != targetGroups.end() && !error; ++n) {
2230  error = FoundGroup(*n).change_folding(basicMode);
2231  }
2232 
2233  if (!error && (mode & GFM_COLLAPSE_REST)) { // collapse everything else
2234  SearchedTreeContainer searched_tree;
2235  collect_searched_trees(gb_main, trees_to_search, searched_tree);
2236 
2237  for (SearchedTreeIter t = searched_tree.begin(); t != searched_tree.end() && !error; ++t) {
2238  GBDATA *gb_tree_data = t->get_tree_data();
2239  for (GBDATA *gb_node = GB_entry(gb_tree_data, "node"); gb_node && !error; gb_node = GB_nextEntry(gb_node)) {
2240  GBDATA *gb_name = GB_entry(gb_node, "group_name");
2241  if (gb_name) { // named node (aka group)
2242  if (targetGroups.find(gb_node) == targetGroups.end()) { // not already handled before
2243  error = FoundGroup(gb_node).change_folding(GFM_COLLAPSE);
2244  }
2245  }
2246  }
2247  }
2248  }
2249 
2250  return ta.close(error);
2251 }
2252 
// Collects the names of the leafs located below the groups listed in 'groups' into 'species'.
//
// All searched trees are examined; in each tree only those groups of 'groups'
// are used, which belong to that tree (compared via their tree data).
//
// cmode == UNITE     -> 'species' receives the union of the leaf names of all used groups
// cmode == INTERSECT -> 'species' receives only names occurring in each used group;
//                       if that set gets empty, collecting stops with an error.
//
// Note: 'species' is not cleared here. An empty 'species' is interpreted as
// "no group has been handled yet" (the first group is copied verbatim).
//
// Leafs without name do not contribute.
2253 ARB_ERROR GroupSearch::collectSpecies(const QueriedGroups& groups, CollectMode cmode, SpeciesNames& species) {
2254 SearchedTreeContainer searched_tree;
2255 collect_searched_trees(gb_main, trees_to_search, searched_tree);
2256 
2257 ARB_ERROR error;
2258 for (SearchedTreeIter t = searched_tree.begin(); t != searched_tree.end() && !error; ++t) {
// pick those groups from 'groups' that are located in the current tree
2259 GBDATAset groupsFoundInTree;
2260 for (FoundGroupCIter g = groups.begin(); g != groups.end(); ++g) {
2261 if (t->get_tree_data() == g->get_tree_data()) {
2262 groupsFoundInTree.insert(g->get_pointer());
2263 }
2264 }
2265 
2266 if (!groupsFoundInTree.empty()) {
2267 // iterate over tree and insert or intersect species from each group with set
2268 GroupSearchRoot *troot = t->get_tree_root();
2269 
// walk along the edges of the tree until the start edge is reached again (or an error occurs)
2270 ARB_edge start = rootEdge(troot);
2271 ARB_edge e = start;
2272 do {
2273 if (e.is_inner_edge() && e.get_type() != EDGE_TO_ROOT) {
2274 TreeNode *node = e.dest();
2275 if (node->is_normal_group()) {
2276 if (groupsFoundInTree.find(node->gb_node) != groupsFoundInTree.end()) {
2277 // iterate all leafs in subtree and store in 'speciesInGroup'
2278 SpeciesNames speciesInGroup;
2279 ARB_edge sub = e;
2280 ARB_edge stop = sub.inverse();
2281 
// traversal of the subtree ends when the inverse of the group's edge is reached
2282 while (sub != stop) {
2283 if (sub.is_edge_to_leaf()) {
2284 TreeNode *leaf = sub.dest();
2285 if (leaf->name) speciesInGroup.insert(leaf->name);
2286 }
2287 sub = sub.next();
2288 }
2289 
2290 if (species.empty()) { // simply add first group
2291 gs_assert(!speciesInGroup.empty()); // tree broken?
2292 species = speciesInGroup;
2293 }
2294 else { // intersect or unite two groups
2295 SpeciesNames combined;
2296 if (cmode == INTERSECT) {
2297 set_intersection(
2298 speciesInGroup.begin(), speciesInGroup.end(),
2299 species.begin(), species.end(),
2300 // combined.begin()
2301 inserter(combined, combined.begin())
2302 );
2303 
// an empty intersection sets 'error', which terminates both the edge loop and the tree loop
2304 if (combined.empty()) {
2305 error = "No species is member of ALL groups";
2306 }
2307 }
2308 else {
2309 gs_assert(cmode == UNITE);
2310 set_union(
2311 speciesInGroup.begin(), speciesInGroup.end(),
2312 species.begin(), species.end(),
2313 // combined.begin()
2314 inserter(combined, combined.begin())
2315 );
2316 }
2317 species = combined;
2318 }
2319 }
2320 }
2321 }
2322 e = e.next();
2323 }
2324 while (e != start && !error);
2325 }
2326 }
2327 return error;
2328 }
2329 
2330 static void set_marks_of(const SpeciesNames& targetSpecies, GBDATA *gb_main, GroupMarkMode mode) {
2331  if (!targetSpecies.empty()) {
2332  size_t found = 0;
2333  for (GBDATA *gb_species = GBT_first_species(gb_main);
2334  gb_species;
2335  gb_species = GBT_next_species(gb_species))
2336  {
2337  const char *name = GBT_get_name_or_description(gb_species);
2338  if (targetSpecies.find(name) != targetSpecies.end()) {
2339  ++found;
2340  if (mode == GMM_INVERT) {
2341  UNCOVERED();
2342  GB_write_flag(gb_species, !GB_read_flag(gb_species));
2343  }
2344  else {
2345  UNCOVERED();
2346  GB_write_flag(gb_species, mode == GMM_MARK);
2347  }
2348  }
2349  }
2350  size_t targetted = targetSpecies.size();
2351  if (found<targetted) {
2352  size_t zombies = targetted-found;
2353  GBT_message(gb_main, GBS_global_string("Warning: Refused to touch %zu zombies", zombies));
2354  }
2355  }
2356 }
2357 
2359  ARB_ERROR error;
2360  if (idx<found->size()) {
2361  QueriedGroups groups;
2362  groups.add_informed_group((*found)[idx]);
2363 
2364  SpeciesNames targetSpecies;
2365  error = collectSpecies(groups, UNITE, targetSpecies);
2366  if (!error) set_marks_of(targetSpecies, gb_main, mode);
2367  }
2368  return error;
2369 }
2371  // intersect == true -> affect only species which are members of ALL found groups
2372  ARB_ERROR error;
2373  if (has_results()) {
2374  SpeciesNames targetSpecies;
2375  error = collectSpecies(*found, cmode, targetSpecies);
2376  if (!error) set_marks_of(targetSpecies, gb_main, mode);
2377  }
2378  return error;
2379 }
2380 
2382  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2383  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2384  return strdup(target_group.get_group_name()); // retrieve group name
2385  }
2386  const char *get_name() const OVERRIDE { return "name"; }
2387 };
2388 
2390  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2391  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2392  const FoundGroup& group = target_group.get_group();
2393 
2394  return GBS_global_string_copy("%i", int(group.is_folded()));
2395  }
2396  const char *get_name() const OVERRIDE { return "folded"; }
2397 };
2398 
2400  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2401  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2402  return GBS_global_string_copy("%e", target_group.get_average_ingroup_distance());
2403  }
2404  const char *get_name() const OVERRIDE { return "AID"; }
2405 };
2406 
2408  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2409  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2410  return GBS_global_string_copy("%u", target_group.get_group_size());
2411  }
2412  const char *get_name() const OVERRIDE { return "size"; }
2413 };
2415  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2416  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2417  return GBS_global_string_copy("%i", target_group.get_keeledStateInfo());
2418  }
2419  const char *get_name() const OVERRIDE { return "keeled"; }
2420 };
2422  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2423  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2424  return GBS_global_string_copy("%u", target_group.get_zombie_count());
2425  }
2426  const char *get_name() const OVERRIDE { return "zombies"; }
2427 };
2429  bool percent;
2430 public:
2431  GroupMarkedKey(bool percent_) :
2432  percent(percent_)
2433  {}
2434  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2435  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2436 
2437  int marked = target_group.get_marked_count();
2438  if (percent) {
2439  int size = target_group.get_group_size();
2440  double pc = 100.0*marked/size;
2441  return GBS_global_string_copy("%5.2f", pc);
2442  }
2443 
2444  return GBS_global_string_copy("%u", marked);
2445  }
2446  const char *get_name() const OVERRIDE { return "marked"; }
2447 };
2448 
2450  const GroupSearch& group_search;
2451 public:
2452  NestingLevelKey(const GroupSearch& group_search_) :
2453  group_search(group_search_)
2454  {}
2455  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2456  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2457  const FoundGroup& group = target_group.get_group();
2458 
2459  return GBS_global_string_copy("%i", group_search.calc_nesting_level(group.get_pointer()));
2460  }
2461  const char *get_name() const OVERRIDE { return "nesting"; }
2462 };
2463 
2465  const GroupSearch& group_search;
2466  bool directParentOnly; // true -> direct parent; false -> any parent (iterates)
2467 
2468  mutable GBDATA *gb_parent;
2469  mutable int distance; // 1=direct parent, 2=parent of direct parent, ...
2470 
2471  static inline query_key_type detectKeyType(CriterionType ctype) {
2472  query_key_type qkt;
2473  switch (ctype) {
2474  case CT_PARENT_DIRECT: qkt = QKEY_EXPLICIT; break;
2475  case CT_PARENT_ANY: qkt = QKEY_ANY; break;
2476  case CT_PARENT_ALL: qkt = QKEY_ALL; break;
2477  default: gs_assert(0); break;
2478  }
2479  return qkt;
2480  }
2481 
2482 public:
2483  ParentGroupNameQueryKey(const GroupSearch& group_search_, CriterionType ctype) :
2484  QueryKey(detectKeyType(ctype)),
2485  group_search(group_search_),
2486  directParentOnly(ctype == CT_PARENT_DIRECT),
2487  gb_parent(NULp),
2488  distance(0)
2489  {
2490  gs_assert(ctype == CT_PARENT_DIRECT || ctype == CT_PARENT_ANY || ctype == CT_PARENT_ALL);
2491  }
2493 
2494  char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE {
2495  // retrieve name of parent group
2496  if (!gb_parent) { // search first (direct) parent
2497  const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target);
2498  const FoundGroup& group = target_group.get_group();
2499 
2500  gb_parent = group_search.get_parent_group(group.get_pointer());
2501  ++distance;
2502  if (!gb_parent) return strdup(""); // does not match "*"
2503  }
2504 
2505  FoundGroup parent(gb_parent);
2506  return strdup(parent.get_name());
2507  }
2508  const char *get_name() const OVERRIDE {
2509  // name of target (e.g. for reports)
2510  if (get_type() == QKEY_EXPLICIT) { // direct parent
2511  return "parent-name";
2512  }
2513 
2514  return GBS_global_string("parent-%i-name", distance);
2515  }
2516  bool iterate() const OVERRIDE {
2517  // iterate key to next entry (not for QKEY_EXPLICIT)
2518  if (gb_parent && get_type() != QKEY_EXPLICIT) {
2519  gb_parent = group_search.get_parent_group(gb_parent);
2520  ++distance;
2521  return gb_parent;
2522  }
2523  return false;
2524  }
2525  void reset() const OVERRIDE {
2526  // reset iteration
2527  gb_parent = NULp;
2528  distance = 0;
2529  }
2530 
2531 };
2532 
2534  query_operator aqo = ILLEGAL;
2535 
2536  if (query_expr.isNull()) {
2537  aqo = OR; // first is always OR
2538  }
2539  else {
2540  switch (op) {
2541  case CO_AND: aqo = AND; break;
2542  case CO_OR: aqo = OR; break;
2543  case CO_IGNORE:
2544  return; // ignore this expression
2545  }
2546  }
2547 
2548  QueryKeyPtr key;
2549  switch (type) {
2550  case CT_NAME: key = new GroupNameQueryKey; break;
2551  case CT_FOLDED: key = new GroupFoldedKey; break;
2552  case CT_NESTING_LEVEL: key = new NestingLevelKey(*this); break;
2553  case CT_SIZE: key = new GroupSizeKey; break;
2554  case CT_MARKED: key = new GroupMarkedKey(false); break;
2555  case CT_MARKED_PC: key = new GroupMarkedKey(true); break;
2556  case CT_ZOMBIES: key = new GroupZombiesKey; break;
2557 
2558  case CT_PARENT_DIRECT:
2559  case CT_PARENT_ANY:
2560  case CT_PARENT_ALL: key = new ParentGroupNameQueryKey(*this, type); break;
2561 
2562  case CT_AID: key = new GroupAIDkey; break;
2563  case CT_KEELED: key = new GroupKeeledKey; break;
2564  }
2565 
2566  QueryExpr *qe = new QueryExpr(aqo, key, mtype == CM_MISMATCH, expression);
2567  if (query_expr.isNull()) { // store 1st
2568  query_expr = qe;
2569  }
2570  else { // append others
2571  query_expr->append(qe);
2572  }
2573 }
2575  query_expr.setNull();
2576 }
2577 
2578 
2579 // --------------------------------------------------------------------------------
2580 
2581 #ifdef UNIT_TESTS
2582 #ifndef TEST_UNIT_H
2583 #include <test_unit.h>
2584 #endif
2585 
// Selects what gets listed for each group by the test helpers.
enum GroupListType {
    GLT_NAME,            // name only
    GLT_NAME_TREE,       // name + tree
    GLT_NAME_SIZE,       // name + size
    GLT_NAME_AID,        // name + AID
    GLT_CLUST_NT,        // cluster, name + tree
    GLT_NAME_FOLD,       // shows foldings state
    GLT_NAME_AND_PARENT, // shows parent relation (using ParentCache)
    GLT_KNAME_NEST,      // shows keeled state and nesting
};
2596 
2597 static arb_test::match_expectation groupListingIs(const QueriedGroups& foundGroups, GroupListType type, const char *expected_entries) {
2598  using namespace arb_test;
2599 
2600  ParentCache& pcache = GroupSearch::get_common()->get_parent_cache();
2601 
2602  StrArray entries;
2603  for (FoundGroupCIter g = foundGroups.begin(); g != foundGroups.end(); ++g) {
2604  switch (type) {
2605  case GLT_NAME:
2606  entries.put(strdup(g->get_name()));
2607  break;
2608 
2609  case GLT_NAME_TREE:
2610  entries.put(GBS_global_string_copy("%s/%s", g->get_name(), g->get_tree_name()));
2611  break;
2612 
2613  case GLT_NAME_SIZE:
2614  entries.put(GBS_global_string_copy("%s(%i)", g->get_name(), g->get_size()));
2615  break;
2616 
2617  case GLT_NAME_AID:
2618  entries.put(GBS_global_string_copy("%s(%.4f)", g->get_name(), g->get_aid()));
2619  break;
2620 
2621  case GLT_CLUST_NT:
2622  entries.put(GBS_global_string_copy("%i/%s/%s", g->get_cluster_id(), g->get_name(), g->get_tree_name()));
2623  break;
2624 
2625  case GLT_NAME_FOLD: {
2626  const char *format = g->is_folded() ? "[%s]" : "%s";
2627  entries.put(GBS_global_string_copy(format, g->get_name()));
2628  break;
2629  }
2630  case GLT_NAME_AND_PARENT: {
2631  GBDATA *gb_parent = pcache.lookupParent(g->get_pointer());
2632  if (gb_parent) {
2633  entries.put(GBS_global_string_copy("%s<%s>", FoundGroup(gb_parent).get_name(), g->get_name()));
2634  }
2635  else {
2636  entries.put(strdup(g->get_name()));
2637  }
2638  break;
2639  }
2640  case GLT_KNAME_NEST: {
2641  int kstate = g->get_keeled();
2642  const char *kprefix = kstate ? (kstate == 1 ? "!" : "?") : "";
2643  entries.put(GBS_global_string_copy("%s%s(L%i)", kprefix, g->get_name(), g->get_nesting()));
2644  break;
2645  }
2646  }
2647  }
2648 
2649  SmartCharPtr found_entriesP = GBT_join_strings(entries, '*');
2650  const char *found_entries = &*found_entriesP;
2651  return that(found_entries).is_equal_to(expected_entries);
2652 }
2653 
2654 static arb_test::match_expectation speciesInGroupsAre(GroupSearch& gs, CollectMode cmode, const char *expected_species) {
2655  using namespace arb_test;
2656  expectation_group fulfilled;
2657 
2658  SpeciesNames species;
2659  {
2660  const QueriedGroups& groups = gs.get_results();
2661  ARB_ERROR error = gs.collectSpecies(groups, cmode, species);
2662  fulfilled.add(doesnt_report_error(error));
2663  }
2664 
2665  ConstStrArray entries;
2666  for (SpeciesNames::const_iterator n = species.begin(); n != species.end(); ++n) {
2667  entries.put(n->c_str());
2668  }
2669  entries.sort(GB_string_comparator, NULp);
2670 
2671  SmartCharPtr contained_speciesP = GBT_join_strings(entries, ',');
2672  const char *contained_species = &*contained_speciesP;
2673  fulfilled.add(that(contained_species).is_equal_to(expected_species));
2674 
2675  return all().ofgroup(fulfilled);
2676 }
2677 
2678 static arb_test::match_expectation resultListingIs(GroupSearch& gs, GroupListType type, const char *expected_entries) {
2679  using namespace arb_test;
2680 
2681  const QueriedGroups& results = gs.get_results();
2683 
2684  return groupListingIs(results, type, expected_entries);
2685 }
2686 
2687 static arb_test::match_expectation hasOrder(const GroupSearch& gs, const char *expected_order) {
2688  using namespace arb_test;
2689 
2690  const int MAX_ORDER = 20;
2691  char found_order[MAX_ORDER];
2692  int off = 0;
2693 
2694  const SortCriteria& order = gs.inspect_order();
2695  for (SortCriteria::const_iterator i = order.begin(); i != order.end(); ++i) {
2696  char c = '?';
2697  switch (*i) {
2698  case GSC_NONE: c = '_'; break;
2699  case GSC_NAME: c = 'N'; break;
2700  case GSC_TREENAME: c = 'T'; break;
2701  case GSC_TREEORDER: c = 'O'; break;
2702  case GSC_REVERSE: c = '!'; break;
2703  case GSC_HIT_REASON: c = 'R'; break; // @@@ untested
2704  case GSC_NESTING: c = 'G'; break; // --- dito ---
2705  case GSC_SIZE: c = 'S'; break; // --- dito ---
2706  case GSC_MARKED: c = 'M'; break; // --- dito ---
2707  case GSC_MARKED_PC: c = '%'; break; // --- dito ---
2708  case GSC_CLUSTER: c = 'C'; break;
2709  case GSC_AID: c = 'A'; break;
2710  case GSC_KEELED: c = 'k'; break;
2711  }
2712  found_order[off++] = c;
2713  }
2714  gs_assert(off<MAX_ORDER);
2715  found_order[off] = 0;
2716  return that(found_order).is_equal_to(expected_order);
2717 }
2718 
2719 static arb_test::match_expectation addingCriterionProduces(GroupSearch& gs, GroupSortCriterion crit, const char *expected_order, const char *expected_entries) {
2720  using namespace arb_test;
2721  expectation_group fulfilled;
2722 
2723  gs.addSortCriterion(crit);
2724 
2725  fulfilled.add(hasOrder(gs, expected_order));
2726  fulfilled.add(resultListingIs(gs, GLT_NAME_TREE, expected_entries));
2727 
2728  return all().ofgroup(fulfilled);
2729 }
2730 
// counts how often the refresh callback was triggered
static int refreshes_traced = 0;

// refresh callback used by the tests: just counts its invocations
static void trace_refresh_cb() {
    refreshes_traced += 1;
}
2733 
2734 void TEST_group_search() {
2735  GB_shell shell;
2736  GBDATA *gb_main = GB_open("../../demo.arb", "r");
2737 
2738  GroupSearchCallback traceRefresh_cb = makeGroupSearchCallback(trace_refresh_cb);
2739  refreshes_traced = 0;
2740 
2741  {
2742  GroupSearch allGroups(gb_main, traceRefresh_cb);
2743  TEST_EXPECT(allGroups.get_results().empty());
2744 
2745  allGroups.perform_search(GSM_FIND);
2746  TEST_EXPECT(!allGroups.get_results().empty());
2747  TEST_EXPECT_EQUAL(allGroups.get_results().size(), 28);
2748  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_TREE,
2749  "last/tree_test*another group/tree_test*outer/tree_test*inner/tree_test*test/tree_test*outer/tree_test*test/tree_test*xx/tree_test*"
2750  "outer/tree_tree2*g2/tree_tree2*xx/tree_tree2*test/tree_tree2*outer/tree_tree2*inner/tree_tree2*test/tree_tree2*"
2751  "zombsub/tree_zomb*zomb/tree_zomb*ZOMB/tree_zomb*dup/tree_zomb*inner outer group/tree_zomb*inner group/tree_zomb*outer group/tree_zomb*g4/tree_zomb*g3/tree_zomb*g2/tree_zomb*xx/tree_zomb*yy/tree_zomb*eee/tree_zomb"
2752  ));
2753 
2754  TEST_EXPECTATION(hasOrder(allGroups, ""));
2755  allGroups.addSortCriterion(GSC_NAME); // sort by name
2756  TEST_EXPECTATION(hasOrder(allGroups, "N"));
2757  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_TREE,
2758  "ZOMB/tree_zomb*" // @@@ should be sorted case insensitive
2759  "another group/tree_test*dup/tree_zomb*eee/tree_zomb*"
2760  "g2/tree_tree2*g2/tree_zomb*"
2761  "g3/tree_zomb*g4/tree_zomb*"
2762  "inner/tree_test*inner/tree_tree2*" // order is stable
2763  "inner group/tree_zomb*inner outer group/tree_zomb*last/tree_test*"
2764  "outer/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2*" // order is stable
2765  "outer group/tree_zomb*"
2766  "test/tree_test*test/tree_test*test/tree_tree2*test/tree_tree2*" // order is stable
2767  "xx/tree_test*xx/tree_tree2*xx/tree_zomb*" // order is stable
2768  "yy/tree_zomb*zomb/tree_zomb*zombsub/tree_zomb"
2769  ));
2770 
2771  // search only in tree_tree2
2772  TreeNameSet tree2;
2773  tree2.insert("tree_tree2");
2774  allGroups.setSearchRange(tree2);
2775  allGroups.perform_search(GSM_FIND);
2776  TEST_EXPECT_EQUAL(allGroups.get_results().size(), 7);
2777  TEST_EXPECTATION(hasOrder(allGroups, "N")); // results still sorted by name (sort criteria are not reset by new search)
2778  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_TREE, "g2/tree_tree2*inner/tree_tree2*outer/tree_tree2*outer/tree_tree2*test/tree_tree2*test/tree_tree2*xx/tree_tree2"));
2779  }
2780 
2781  {
2782  GroupSearch some(gb_main, traceRefresh_cb);
2783 
2784  some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*ou*");
2785 
2786  some.perform_search(GSM_FIND);
2787  TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "another group*outer*outer*outer*outer*inner outer group*inner group*outer group"));
2788  TEST_EXPECT_EQUAL(some.get_results().get_column_widths().name, 17);
2789 
2790  // test 2nd filter
2791  some.forgetQExpressions();
2792  some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*er");
2793  some.perform_search(GSM_FIND);
2794  TEST_EXPECTATION(resultListingIs(some, GLT_NAME_TREE, "outer/tree_test*inner/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2*inner/tree_tree2"));
2795  TEST_EXPECT_EQUAL(some.get_results().get_column_widths().name, 5);
2796 
2797  {
2798  // test order
2799  const char *BY_NAME_FWD = "inner/tree_test*inner/tree_tree2*outer/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2";
2800  const char *BY_NAME_REV = "outer/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2*inner/tree_test*inner/tree_tree2";
2801 
2802  TEST_EXPECTATION(addingCriterionProduces(some, GSC_NAME, "N", BY_NAME_FWD));
2803  TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "!N", BY_NAME_REV));
2804  TEST_EXPECTATION(addingCriterionProduces(some, GSC_NAME, "N", BY_NAME_FWD));
2805 
2806  // test multiple "reverse" criteria
2807  TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "!N", BY_NAME_REV));
2808  TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "N", BY_NAME_FWD));
2809  TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "!N", BY_NAME_REV));
2810 
2811  // test sort by treename
2812  TEST_EXPECTATION(addingCriterionProduces(some, GSC_TREENAME, "T!N", "outer/tree_test*outer/tree_test*inner/tree_test*outer/tree_tree2*outer/tree_tree2*inner/tree_tree2"));
2813  TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "!T!N", "inner/tree_tree2*outer/tree_tree2*outer/tree_tree2*inner/tree_test*outer/tree_test*outer/tree_test"));
2814 
2815  // test sort by tree-order (as specified in tree-admin)
2816  TEST_EXPECTATION(addingCriterionProduces(some, GSC_TREEORDER, "O!T!N", "inner/tree_test*outer/tree_test*outer/tree_test*inner/tree_tree2*outer/tree_tree2*outer/tree_tree2"));
2817  TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "!O!T!N", "outer/tree_tree2*outer/tree_tree2*inner/tree_tree2*outer/tree_test*outer/tree_test*inner/tree_test"));
2818 
2819  some.forgetSortCriteria();
2820  }
2821 
2822  // combine both filters (conjunction will only report 'outer')
2823  some.addQueryExpression(CO_AND, CT_NAME, CM_MATCH, "*ou*");
2824  some.perform_search(GSM_FIND);
2825  TEST_EXPECTATION(resultListingIs(some, GLT_NAME_TREE, "outer/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2"));
2826 
2827  // test adding results
2828  some.forgetQExpressions();
2829  some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*xx*");
2830  some.perform_search(GSM_ADD);
2831  TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "outer*outer*outer*outer*xx*xx*xx"));
2832 
2833  some.forgetQExpressions();
2834  some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*er*");
2835  some.perform_search(GSM_ADD); // check no duplicates are reported (filter also matches 'outer')
2836  TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "outer*outer*outer*outer*xx*xx*xx*another group*inner*inner*inner outer group*inner group*outer group"));
2837 
2838  // test removing a single result
2839  {
2840  some.addSortCriterion(GSC_TREEORDER); // first change order to make removal comprehensible
2841  TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "outer*outer*xx*another group*inner*outer*outer*xx*inner*xx*inner outer group*inner group*outer group"));
2842 
2843  const char *FIRST_XX_REMOVED = "outer*outer*another group*inner*outer*outer*xx*inner*xx*inner outer group*inner group*outer group";
2844  some.remove_hit(2); // remove first 'xx'
2845  TEST_EXPECTATION(resultListingIs(some, GLT_NAME, FIRST_XX_REMOVED));
2846  // test that out-of-bounds removals are NOOPs:
2847  some.remove_hit(-10); TEST_EXPECTATION(resultListingIs(some, GLT_NAME, FIRST_XX_REMOVED));
2848  some.remove_hit(100); TEST_EXPECTATION(resultListingIs(some, GLT_NAME, FIRST_XX_REMOVED));
2849  }
2850 
2851  // test keeping results
2852  some.forgetQExpressions();
2853  some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*ou*");
2854  some.perform_search(GSM_KEEP);
2855  TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "outer*outer*another group*outer*outer*inner outer group*inner group*outer group"));
2856 
2857  // test removing results (also tests "mismatch")
2858  some.forgetQExpressions();
2859  some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "outer");
2860  some.perform_search(GSM_REMOVE);
2861  TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "another group*inner outer group*inner group*outer group"));
2862  }
2863 
2864  // test different search keys
2865  {
2866  GroupSearch keyed(gb_main, traceRefresh_cb);
2867  const char *TOP_GROUPS = "last*another group*outer*test*outer*outer*zombsub*dup*inner outer group";
2868 
2869  // CT_PARENT_DIRECT (direct parent group name)
2870  keyed.addQueryExpression(CO_OR, CT_PARENT_DIRECT, CM_MATCH, ""); // direct parent w/o name (=no direct parent)
2871  keyed.perform_search(GSM_FIND);
2872  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, TOP_GROUPS)); // -> TOP_GROUPS
2873 
2874  keyed.forgetQExpressions();
2875  keyed.addQueryExpression(CO_OR, CT_PARENT_DIRECT, CM_MATCH, "/^[^ ]*ou[^ ]*$/"); // uses regular expression query
2876  keyed.perform_search(GSM_FIND);
2877  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "outer<inner>*outer<test>*outer<xx>*outer<g2>*outer<test>*outer<inner>*outer<test>"));
2878 
2879  // CT_PARENT_ANY
2880  keyed.forgetQExpressions();
2881  keyed.addQueryExpression(CO_OR, CT_PARENT_ANY, CM_MATCH, "|contains(\"ou\");contains(\" \")|equals(0)|minus");
2882  keyed.perform_search(GSM_FIND);
2883  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "outer<inner>*outer<test>*outer<xx>*outer<g2>*g2<xx>*outer<test>*test<outer>*outer<inner>*outer<test>"));
2884 
2885  // CT_PARENT_ALL
2886  keyed.forgetQExpressions();
2887  keyed.addQueryExpression(CO_OR, CT_PARENT_ALL, CM_MISMATCH, "/ou/"); // not inside group containing 'ou'
2888  keyed.addQueryExpression(CO_AND, CT_NAME, CM_MISMATCH, "/ou/"); // and not containing 'ou' itself
2889  keyed.perform_search(GSM_FIND);
2890  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "last*test*zombsub*zombsub<zomb>*zombsub<ZOMB>*dup"));
2891 
2892  // CT_NESTING_LEVEL
2893  keyed.forgetQExpressions();
2894  keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, "<1"); // nesting level less than 1
2895  keyed.perform_search(GSM_FIND);
2896  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, TOP_GROUPS)); // -> TOP_GROUPS
2897 
2898  keyed.forgetQExpressions();
2899  keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MISMATCH, ">0"); // nesting level not above 0
2900  keyed.perform_search(GSM_FIND);
2901  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, TOP_GROUPS)); // -> TOP_GROUPS
2902 
2903  keyed.forgetQExpressions();
2904  keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, ">4"); // too high nesting level
2905  keyed.perform_search(GSM_FIND);
2906  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, ""));
2907 
2908  keyed.forgetQExpressions();
2909  keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, ">3"); // highest occurring nesting level
2910  keyed.perform_search(GSM_FIND);
2911  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "yy<eee>")); // one group with nesting level 4
2912 
2913  keyed.forgetQExpressions();
2914  keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, ">2");
2915  keyed.perform_search(GSM_FIND);
2916  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "outer<inner>*g2<xx>*g2<yy>*yy<eee>")); // 1xL4 + 3xL3
2917 
2918  keyed.forgetQExpressions();
2919  keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, ">1");
2920  keyed.addQueryExpression(CO_AND, CT_NESTING_LEVEL, CM_MATCH, "<4");
2921  keyed.perform_search(GSM_FIND);
2922  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "g2<xx>*test<outer>*outer<inner>*outer group<g4>*outer group<g3>*outer group<g2>*g2<xx>*g2<yy>")); // 5x L2 + 3x L3
2923 
2924  keyed.forgetQExpressions();
2925  keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, "2");
2926  keyed.perform_search(GSM_FIND);
2927  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "g2<xx>*test<outer>*outer group<g4>*outer group<g3>*outer group<g2>")); // 5x L2
2928 
2929  // CT_FOLDED
2930  const char *EXPANDED_GROUPS = "last*outer*outer<inner>*outer*outer*zombsub";
2931  keyed.forgetQExpressions();
2932  keyed.addQueryExpression(CO_OR, CT_FOLDED, CM_MATCH, "0");
2933  keyed.perform_search(GSM_FIND);
2934  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, EXPANDED_GROUPS));
2935 
2936  keyed.forgetQExpressions();
2937  keyed.addQueryExpression(CO_OR, CT_NAME /*does not matter*/, CM_MISMATCH, "|readdb(grouped)|equals(1)"); // directly access field of group-container
2938  keyed.perform_search(GSM_FIND);
2939  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, EXPANDED_GROUPS));
2940 
2941  // CT_SIZE
2942  keyed.forgetQExpressions();
2943  keyed.addQueryExpression(CO_OR, CT_SIZE, CM_MATCH, ">12"); // find bigger groups
2944  keyed.perform_search(GSM_FIND);
2945  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_SIZE, "another group(29)*outer(15)*outer(47)*zombsub(14)*inner outer group(19)*outer group(15)"));
2946  keyed.addQueryExpression(CO_AND, CT_SIZE, CM_MATCH, "|rest(2)|equals(0)"); // with even groupsize only
2947  keyed.perform_search(GSM_FIND);
2948  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_SIZE, "zombsub(14)")); // the only bigger group with an even number of members
2949 
2950  // CT_MARKED + CT_MARKED_PC
2951  keyed.forgetQExpressions();
2952  keyed.addQueryExpression(CO_OR, CT_MARKED, CM_MATCH, ">7"); // at least 8 marked species inside group
2953  keyed.perform_search(GSM_FIND);
2954  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, "another group*outer*inner outer group*outer group"));
2955 
2956  const char *COMPLETELY_MARKED_GROUPS = "test*xx*xx*g4*xx*eee";
2957  keyed.forgetQExpressions();
2958  keyed.addQueryExpression(CO_OR, CT_MARKED_PC, CM_MATCH, ">99"); // completely marked groups (more than 99%)
2959  keyed.perform_search(GSM_FIND);
2960  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, COMPLETELY_MARKED_GROUPS));
2961  keyed.forgetQExpressions();
2962  keyed.addQueryExpression(CO_OR, CT_MARKED_PC, CM_MISMATCH, "<100"); // completely marked groups (not less than 100%)
2963  keyed.perform_search(GSM_FIND);
2964  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, COMPLETELY_MARKED_GROUPS));
2965  keyed.forgetQExpressions();
2966  keyed.addQueryExpression(CO_OR, CT_MARKED_PC, CM_MATCH, "100"); // completely marked groups (equal to 100%)
2967  keyed.perform_search(GSM_FIND);
2968  TEST_EXPECTATION__BROKEN(resultListingIs(keyed, GLT_NAME, COMPLETELY_MARKED_GROUPS), // @@@ matching % for equality does not work as expected
2969  resultListingIs(keyed, GLT_NAME, ""));
2970 
2971 
2972  keyed.forgetQExpressions();
2973  keyed.addQueryExpression(CO_OR, CT_MARKED, CM_MISMATCH, "0"); // groups with marked..
2974  keyed.addQueryExpression(CO_AND, CT_MARKED_PC, CM_MATCH, "<50"); // ..but less than 50%
2975  keyed.perform_search(GSM_FIND);
2976  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, "outer*outer*test"));
2977 
2978  // CT_ZOMBIES
2979  keyed.forgetQExpressions();
2980  keyed.addQueryExpression(CO_OR, CT_ZOMBIES, CM_MISMATCH, "0"); // groups with zombies
2981  keyed.perform_search(GSM_FIND);
2982  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, "zombsub*zomb*ZOMB"));
2983 
2984  // CT_AID
2985  keyed.forgetQExpressions();
2986  keyed.addQueryExpression(CO_OR, CT_AID, CM_MATCH, ">1"); // groups with high AID
2987  keyed.perform_search(GSM_FIND);
2988  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AID, "outer(1.0996)*outer(1.1605)"));
2989 
2990  keyed.forgetQExpressions();
2991  keyed.addQueryExpression(CO_OR, CT_AID, CM_MATCH, "<.1"); // groups with low AID
2992  keyed.perform_search(GSM_FIND);
2993  keyed.addSortCriterion(GSC_AID);
2994  keyed.addSortCriterion(GSC_REVERSE);
2995  TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AID, "xx(0.0786)*xx(0.0786)*g3(0.0665)*dup(0.0399)*inner group(0.0259)"));
2996 
2997  // CT_KEELED is tested in TEST_keeled_group_search()
2998  }
2999 
3000  TEST_EXPECT_EQUAL(refreshes_traced, 0); // no refresh traced up to here
3001 
3002  // test group-actions:
3003 
3004  {
3005  refreshes_traced = 0;
3006 
3007  GroupSearch misc(gb_main, traceRefresh_cb);
3008 
3009  misc.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*e*");
3010  misc.addQueryExpression(CO_AND, CT_NAME, CM_MISMATCH, "* *");
3011  misc.perform_search(GSM_FIND);
3012  {
3013  const char *ACI_add_tag = "\"[TAG] \";dd";
3014 
3015  const char *BEFORE_RENAME = "outer*inner*test*outer*test*outer*test*outer*inner*test*eee";
3016  const char *OUTER_PREFIXED = "[TAG] outer*inner*test*outer*test*outer*test*outer*inner*test*eee";
3017 
3018  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, BEFORE_RENAME));
3019 
3020  // test renaming groups:
3021  TEST_EXPECT_NO_ERROR(misc.rename_group(0, ACI_add_tag)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, OUTER_PREFIXED)); // prefix first 'outer'
3022  TEST_EXPECT_NO_ERROR(misc.rename_group(0, "\"\"")); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, OUTER_PREFIXED)); // test empty ACI-result does not rename anything
3023 
3024  TEST_EXPECT_NO_ERROR(misc.rename_found_groups("\"[X]\";dd;\" \"")); // prefix '[X]' to all found groups + suffix space (which are trimmed away afterwards)
3025  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "[X][TAG] outer*[X]inner*[X]test*[X]outer*[X]test*[X]outer*[X]test*[X]outer*[X]inner*[X]test*[X]eee"));
3026 
3027  // test errors get reported:
3028  TEST_EXPECT_ERROR_CONTAINS(misc.rename_group(0, ":x"), "no '=' found");
3029  TEST_EXPECT_ERROR_CONTAINS(misc.rename_found_groups(":x"), "no '=' found");
3030 
3031  TEST_EXPECT_NO_ERROR(misc.rename_found_groups("/\\[.*\\]//")); // remove any prefixes
3032 
3033  TEST_EXPECT_NO_ERROR(misc.rename_found_groups("dd;\"_\";hitidx;\"/\";hitcount")); // append "_index/hitcount" to groupname
3034  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer_1/11*inner_2/11*test_3/11*outer_4/11*test_5/11*outer_6/11*test_7/11*outer_8/11*inner_9/11*test_10/11*eee_11/11"));
3035 
3036  TEST_EXPECT_NO_ERROR(misc.rename_found_groups("command(\"/_.*$//\")|dd;\"_\";markedInGroup;\"/\";groupSize")); // replace suffix with "marked/size"
3037  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer_6/11*inner_4/5*test_7/7*outer_7/15*test_0/4*outer_20/47*test_6/12*outer_6/11*inner_4/5*test_2/6*eee_3/3"));
3038 
3039  TEST_EXPECT_NO_ERROR(misc.rename_found_groups(":_*=_L*(|nesting)\\=*(|aid)")); // replace suffix with nesting level and aid
3040  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer_L0=0.695293*inner_L1=0.269289*test_L0=0.160956*outer_L0=1.099650*test_L1=0.591923*outer_L0=1.160535*test_L1=0.726679*outer_L2=0.704352*inner_L3=0.265516*test_L1=0.303089*eee_L4=0.229693"));
3041 
3042  // undo renaming groups (to avoid need to change tests below)
3043  TEST_EXPECT_NO_ERROR(misc.rename_found_groups("/_.*$//")); // remove all behind '_'
3044  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, BEFORE_RENAME));
3045 
3046  TEST_EXPECT_EQUAL(refreshes_traced, 7); // amount of result-list refreshes that would happen (1 * rename_group() + 6 * rename_found_groups(); one rename_group did nothing!)
3047  refreshes_traced = 0;
3048  }
3049 
3050  {
3051  GroupSearch all(gb_main, traceRefresh_cb); // run a 2nd search
3052  GroupSearch none(gb_main, traceRefresh_cb); // run a 3rd search
3053  GroupSearch few(gb_main, traceRefresh_cb); // run a 4th search
3054 
3055  // test folding single groups
3056  TEST_EXPECTATION( resultListingIs(misc, GLT_NAME_FOLD, "outer*inner*[test]*outer*[test]*outer*[test]*[outer]*[inner]*[test]*[eee]")); // shows current folding state
3057  TEST_EXPECT_NO_ERROR(misc.fold_group(0, GFM_TOGGLE)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_FOLD, "[outer]*inner*[test]*outer*[test]*outer*[test]*[outer]*[inner]*[test]*[eee]")); // fold 1st 'outer'
3058  TEST_EXPECT_NO_ERROR(misc.fold_group(0, GFM_TOGGLE)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_FOLD, "outer*inner*[test]*outer*[test]*outer*[test]*[outer]*[inner]*[test]*[eee]")); // unfold 1st 'outer'
3059 
3060  TEST_EXPECT_EQUAL(refreshes_traced, 2); // 2 result-list refreshes would happen (one for each fold_group())
3061  refreshes_traced = 0;
3062 
3063  none.addQueryExpression(CO_OR, CT_NAME, CM_MISMATCH, "*"); // no such group
3064  all.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*"); // matches all groups
3065  few.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "inner");
3066 
3067  none.perform_search(GSM_FIND);
3068  few.perform_search(GSM_FIND);
3069  all.perform_search(GSM_FIND);
3070 
3071  TEST_EXPECTATION(resultListingIs(none, GLT_NAME, "")); // shows no results
3072  TEST_EXPECTATION(resultListingIs(few, GLT_NAME_FOLD, "inner*[inner]")); // shows some results
3073  // shows current folding state (of all groups from all trees):
3074  TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "last*[another group]*outer*inner*[test]*outer*[test]*[xx]*outer*[g2]*[xx]*[test]*[outer]*[inner]*[test]*zombsub*[zomb]*[ZOMB]*[dup]*[inner outer group]*[inner group]*[outer group]*[g4]*[g3]*[g2]*[xx]*[yy]*[eee]"));
3075 
3076  TEST_EXPECT_EQUAL(refreshes_traced, 0);
3077 
3078  // test folding listed groups
3079  // (Note: that results used for folding and for test differ!)
3080  TEST_EXPECT_NO_ERROR( few.fold_found_groups(GFM_EXPANDREC)); TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "last*[another group]*outer*inner*[test]*outer*[test]*[xx]*" "outer*[g2]*[xx]*test*outer*inner*[test]*" "zombsub*[zomb]*[ZOMB]*[dup]*[inner outer group]*[inner group]*[outer group]*[g4]*[g3]*[g2]*[xx]*[yy]*[eee]")); // [A] only unfolds 2nd inner and 2 of its 3 parent groups
3081  TEST_EXPECT_NO_ERROR(misc.fold_found_groups(GFM_EXPANDREC)); TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "last*[another group]*outer*inner*test*outer*test*[xx]*" "outer*[g2]*[xx]*test*outer*inner*test*" "zombsub*[zomb]*[ZOMB]*[dup]*inner outer group*[inner group]*outer group*[g4]*[g3]*g2*[xx]*yy*eee")); // 'xx' and 'g2' remain folded
3082  TEST_EXPECT_NO_ERROR(misc.fold_found_groups(GFM_COLLAPSE)); TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "last*[another group]*[outer]*[inner]*[test]*[outer]*[test]*[xx]*" "[outer]*[g2]*[xx]*[test]*[outer]*[inner]*[test]*" "zombsub*[zomb]*[ZOMB]*[dup]*inner outer group*[inner group]*outer group*[g4]*[g3]*g2*[xx]*yy*[eee]")); // 'last' remains unfolded
3083  TEST_EXPECT_NO_ERROR( few.fold_found_groups(GFM_EXPANDREC_COLLREST)); TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "[last]*[another group]*outer*inner*[test]*[outer]*[test]*[xx]*" "outer*[g2]*[xx]*test*outer*inner*[test]*" "[zombsub]*[zomb]*[ZOMB]*[dup]*[inner outer group]*[inner group]*[outer group]*[g4]*[g3]*[g2]*[xx]*[yy]*[eee]")); // similar to line [A], but 'last' gets folded
3084  TEST_EXPECT_NO_ERROR(none.fold_found_groups(GFM_EXPANDREC_COLLREST)); TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "[last]*[another group]*[outer]*[inner]*[test]*[outer]*[test]*[xx]*" "[outer]*[g2]*[xx]*[test]*[outer]*[inner]*[test]*" "[zombsub]*[zomb]*[ZOMB]*[dup]*[inner outer group]*[inner group]*[outer group]*[g4]*[g3]*[g2]*[xx]*[yy]*[eee]")); // unfold none+collapse rest = fold all
3085  TEST_EXPECT_NO_ERROR(misc.fold_found_groups(GFM_EXPANDPARENTS)); TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "[last]*[another group]*outer*[inner]*[test]*outer*[test]*[xx]*" "outer*[g2]*[xx]*test*outer*[inner]*[test]*" "[zombsub]*[zomb]*[ZOMB]*[dup]*inner outer group*[inner group]*outer group*[g4]*[g3]*g2*[xx]*yy*[eee]")); // unfold all groups containing listed groups
3086 
3087  TEST_EXPECT_EQUAL(refreshes_traced, 16); // @@@ want less refreshes!
3088  refreshes_traced = 0;
3089 
3090  {
3091  GroupSearch group2(gb_main, traceRefresh_cb); // run a 5th search
3092  group2.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "g2"); // group 'g2' exists in 2 tree; species overlap, but are not identical
3093  group2.perform_search(GSM_FIND);
3094 
3095  GB_transaction ta(gb_main);
3096 
3097  // test retrieval of species contained in groups:
3098  TEST_EXPECTATION(speciesInGroupsAre(none, INTERSECT, ""));
3099 
3100  // groups 'inner' are identical in all trees:
3101  const char *INNER_SPECIES = "McpCapri,McpMyco2,McpMycoi,McpSpeci,SpiMelli";
3102  TEST_EXPECTATION(speciesInGroupsAre(few, UNITE, INNER_SPECIES));
3103  TEST_EXPECTATION(speciesInGroupsAre(few, INTERSECT, INNER_SPECIES));
3104 
3105  TEST_EXPECTATION(speciesInGroupsAre(group2, UNITE, "AnaAbact,BacMegat,BacPaste,CloTyro2,CloTyro4,CloTyrob,StaAureu,StaEpide"));
3106  TEST_EXPECTATION(speciesInGroupsAre(group2, INTERSECT, "AnaAbact,BacMegat,BacPaste," "CloTyro4,CloTyrob,StaAureu"));
3107  }
3108  }
3109 
3110  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_AND_PARENT, "outer*outer<inner>*test*outer*outer<test>*outer*outer<test>*test<outer>*outer<inner>*outer<test>*yy<eee>")); // format is "parent<child>"
3111 
3112  // test deleting groups:
3113  TEST_EXPECT_NO_ERROR(misc.delete_group(6)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer*inner*test*outer*test*outer*outer*inner*test*eee")); // delete 1st 'test' from 'tree_test2' (DEL_TEST)
3114  TEST_EXPECT_NO_ERROR(misc.delete_group(3)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer*inner*test*test*outer*outer*inner*test*eee")); // delete 2nd 'outer' from 'tree_tree' (DEL_OUTER)
3115 
3116  // deleting invalid index only returns an error:
3117  TEST_EXPECT_ERROR_CONTAINS(misc.delete_group(100), "out-of-bounds");
3118  TEST_EXPECT_ERROR_CONTAINS(misc.delete_group(-1), "out-of-bounds");
3119 
3120  TEST_EXPECT_EQUAL(refreshes_traced, 2); // 2 result-list refreshes would happen (one for each delete_group())
3121  refreshes_traced = 0;
3122 
3123  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_AND_PARENT, "outer*outer<inner>*test*test*outer*outer<outer>*outer<inner>*outer<test>*yy<eee>")); // 'test' between 'outer<outer>' got removed
3124 
3125  // delete all (but one) groups named 'outer':
3126  misc.forgetQExpressions();
3127  misc.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "outer");
3128  misc.perform_search(GSM_FIND);
3129  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_TREE, "outer/tree_test*outer/tree_tree2*outer/tree_tree2")); // also tests that 'outer' was deleted from DB; see .@DEL_OUTER
3130 
3131  misc.remove_hit(1); // will not get deleted
3132  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_TREE, "outer/tree_test*outer/tree_tree2"));
3133 
3134  TEST_EXPECT_NO_ERROR(misc.delete_found_groups()); // now delete all listed groups
3135  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_TREE, "")); // result-list is empty now
3136 
3137  misc.perform_search(GSM_FIND); // search again
3138  TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_TREE, "outer/tree_tree2")); // hit removed before deleting listed still exists in DB
3139 
3140  TEST_EXPECT_EQUAL(refreshes_traced, 1); // only one refresh triggered for deletion of all listed groups
3141  }
3142 
3143  {
3144  refreshes_traced = 0;
3145 
3146  GroupSearch outer(gb_main, traceRefresh_cb);
3147  outer.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "test");
3148  outer.perform_search(GSM_FIND);
3149  TEST_EXPECTATION(resultListingIs(outer, GLT_NAME_TREE, "test/tree_test*test/tree_test*test/tree_tree2")); // also tests that 'test' was deleted from DB; see .@DEL_TEST
3150 
3151  // test result-update callbacks (triggered by DB-changes)
3152  { // delete tree_tree2:
3153  GB_transaction ta(gb_main);
3154  GBDATA *gb_tree = GBT_find_tree(gb_main, "tree_tree2");
3155  TEST_REJECT_NULL(gb_tree);
3156  TEST_EXPECT_NO_ERROR(GB_delete(gb_tree));
3157  }
3158  TEST_EXPECT_EQUAL(refreshes_traced, 1); // one modifying TA => only one refresh callback triggered
3159  TEST_EXPECTATION(resultListingIs(outer, GLT_NAME_TREE, "test/tree_test*test/tree_test")); // all results referring 'tree_tree2' were removed
3160  }
3161 
3162 
3163  GB_close(gb_main);
3164 }
3165 
3166 void TEST_keeled_group_search() {
3167  GB_shell shell;
3168  GBDATA *gb_main = GB_open("TEST_trees.arb", "rw");
3169 
3170  GroupSearchCallback traceRefresh_cb = makeGroupSearchCallback(trace_refresh_cb);
3171  refreshes_traced = 0;
3172  {
3173  GB_transaction ta(gb_main);
3174 
3175  GroupSearch allGroups(gb_main, traceRefresh_cb);
3176  {
3177  GroupSearch keeledGroups(gb_main, traceRefresh_cb);
3178  GroupSearch normalGroups(gb_main, traceRefresh_cb);
3179 
3180  TEST_EXPECT(allGroups.get_results().empty());
3181  TEST_EXPECT(keeledGroups.get_results().empty());
3182  TEST_EXPECT(normalGroups.get_results().empty());
3183 
3184  // CT_KEELED:
3185  keeledGroups.addQueryExpression(CO_OR, CT_KEELED, CM_MISMATCH, "0"); // find keeled groups
3186  normalGroups.addQueryExpression(CO_OR, CT_KEELED, CM_MATCH, "0"); // find normal groups
3187 
3188  allGroups.perform_search(GSM_FIND);
3189  keeledGroups.perform_search(GSM_FIND);
3190  normalGroups.perform_search(GSM_FIND);
3191 
3192  TEST_EXPECT(!allGroups.get_results().empty());
3193  TEST_EXPECT(!keeledGroups.get_results().empty());
3194  TEST_EXPECT(!normalGroups.get_results().empty());
3195 
3196  TEST_EXPECT_EQUAL(allGroups.get_results().size(), 21);
3197  TEST_EXPECT_EQUAL(allGroups.get_results().size(),
3198  keeledGroups.get_results().size()+normalGroups.get_results().size());
3199  TEST_EXPECT_EQUAL(keeledGroups.get_results().size(), 6);
3200  TEST_EXPECT_EQUAL(normalGroups.get_results().size(), 15);
3201 
3202  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_TREE,
3203  "test/tree_test*"
3204  "outer/tree_tree2*g2/tree_tree2*"
3205  "outer/tree_removal*g2 [was: test]/tree_removal*"
3206  "lower/tree_groups*low2/tree_groups*twoleafs/tree_groups*low1/tree_groups*upper/tree_groups*"
3207  "twoleafs/tree_keeled*low2/tree_keeled*lower/tree_keeled*upper/tree_keeled*low1/tree_keeled*"
3208  "low2/tree_keeled_2*twoleafs/tree_keeled_2*lower/tree_keeled_2*upper/tree_keeled_2*low1/tree_keeled_2*allButOne/tree_keeled_2" // finds "keeled group at leaf" 'allButOne'; see also ../../ARBDB/adtree.cxx@HIDDEN_KEELED_GROUP
3209  ));
3210 
3211  TEST_EXPECTATION(resultListingIs(keeledGroups, GLT_KNAME_NEST,
3212  "!twoleafs(L0)*!low2(L1)*?lower(L2)*" // tree_keeled
3213  "!low2(L0)*?lower(L1)*!allButOne(L2)" // tree_keeled_2
3214  ));
3215  }
3216 
3217  TreeNameSet keeledTrees;
3218  keeledTrees.insert("tree_keeled");
3219  keeledTrees.insert("tree_keeled_2");
3220 
3221  allGroups.setSearchRange(keeledTrees);
3222  allGroups.perform_search(GSM_FIND);
3223 
3224  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_AND_PARENT,
3225  // tree_keeled:
3226  "twoleafs*twoleafs<low2>*low2<lower>*lower<upper>*"
3227  "low2<low1>*"
3228 
3229  // tree_keeled_2:
3230  "low2*"
3231  "twoleafs*"
3232  "low2<lower>*"
3233  "lower<upper>*" // keeled group 'lower' encloses 'upper'
3234  "low2<low1>*"
3235  "low1<allButOne>"
3236  ));
3237 
3238  // test folding of keeled groups:
3239  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_FOLD,
3240  "twoleafs*low2*lower*upper*low1*" // tree_keeled
3241  "low2*twoleafs*lower*upper*low1*allButOne" // tree_keeled_2
3242  ));
3243 
3244  TEST_EXPECT_NO_ERROR(allGroups.fold_group(0, GFM_TOGGLE)); // fold 'twoleafs'
3245  TEST_EXPECT_NO_ERROR(allGroups.fold_group(2, GFM_TOGGLE)); // fold 'lower' -> does as well fold 'upper' (overlayed groups)
3246 
3247  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_FOLD,
3248  "[twoleafs]*low2*[lower]*[upper]*low1*" // tree_keeled
3249  "low2*twoleafs*lower*upper*low1*allButOne" // tree_keeled_2
3250  ));
3251 
3252  TEST_EXPECT_NO_ERROR(allGroups.fold_group(3, GFM_TOGGLE)); // unfold 'upper' -> does as well unfold 'lower' (overlayed groups)
3253  TEST_EXPECT_NO_ERROR(allGroups.fold_group(10, GFM_TOGGLE));
3254 
3255  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_FOLD,
3256  "[twoleafs]*low2*lower*upper*low1*" // tree_keeled
3257  "low2*twoleafs*lower*upper*low1*[allButOne]" // tree_keeled_2
3258  ));
3259 
3260  TEST_EXPECT_NO_ERROR(allGroups.fold_group(0, GFM_TOGGLE));
3261  TEST_EXPECT_NO_ERROR(allGroups.fold_group(10, GFM_TOGGLE));
3262 
3263  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_FOLD,
3264  "twoleafs*low2*lower*upper*low1*" // tree_keeled
3265  "low2*twoleafs*lower*upper*low1*allButOne" // tree_keeled_2
3266  ));
3267 
3268  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_AID,
3269  // tree_keeled:
3270  "twoleafs(1.4310)*low2(1.4436)*lower(1.0288)*upper(1.0288)*low1(1.1200)*"
3271 
3272  // tree_keeled_2:
3273  "low2(1.4436)*twoleafs(0.0087)*lower(1.0288)*upper(1.0288)*low1(1.1200)*"
3274  "allButOne(0.0000)" // 1 member -> zero AID
3275  ));
3276 
3277  keeledTrees.insert("tree_groups");
3278  allGroups.setSearchRange(keeledTrees);
3279  allGroups.perform_search(GSM_FIND);
3280 
3281  TEST_EXPECTATION(resultListingIs(allGroups, GLT_KNAME_NEST,
3282  // tree_groups:
3283  "lower(L0)*low2(L1)*twoleafs(L2)*low1(L1)*upper(L0)*"
3284 
3285  // tree_keeled:
3286  "!twoleafs(L0)*!low2(L1)*?lower(L2)*upper(L3)*"
3287  "low1(L2)*"
3288 
3289  // tree_keeled_2:
3290  "!low2(L0)*"
3291  "twoleafs(L0)*"
3292  "?lower(L1)*upper(L2)*low1(L1)*!allButOne(L2)"
3293  ));
3294 
3295  TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_SIZE,
3296  // tree_groups:
3297  "lower(10)*low2(3)*twoleafs(2)*low1(7)*upper(5)*"
3298 
3299  // tree_keeled:
3300  "twoleafs(13)*"
3301  "low2(12)*"
3302  "lower(5)*upper(5)*"
3303  "low1(7)*"
3304 
3305  // tree_keeled_2:
3306  "low2(12)*"
3307  "twoleafs(2)*"
3308  "lower(5)*"
3309  "upper(5)*low1(7)*"
3310  "allButOne(1)" // only 1 species!
3311  ));
3312 
3313  allGroups.addSortCriterion(GSC_KEELED);
3314  TEST_EXPECTATION(resultListingIs(allGroups, GLT_KNAME_NEST,
3315  "?lower(L2)*?lower(L1)*!twoleafs(L0)*!low2(L1)*!low2(L0)*!allButOne(L2)*lower(L0)*low2(L1)*twoleafs(L2)*low1(L1)*upper(L0)*upper(L3)*low1(L2)*twoleafs(L0)*upper(L2)*low1(L1)"
3316  ));
3317  }
3318 
3319  GB_close(gb_main);
3320 }
3321 
3322 
3323 
3324 static arb_test::match_expectation does_map_index(const SymmetricMatrixMapper& mm, int x, int y, int lin) {
3325  using namespace arb_test;
3326  expectation_group fulfilled;
3327 
3328  fulfilled.add(that(mm.linear_index(x, y)).is_equal_to(lin));
3329  fulfilled.add(that(mm.linear_index(y, x)).is_equal_to(lin));
3330 
3331  int rx, ry;
3332  mm.to_xy(lin, rx, ry);
3333  if (x>y) swap(x, y);
3334 
3335  fulfilled.add(that(rx).is_equal_to(x));
3336  fulfilled.add(that(ry).is_equal_to(y));
3337 
3338  return all().ofgroup(fulfilled);
3339 }
3340 
3341 void TEST_SymmetricMatrixMapper() {
3342  {
3343  SymmetricMatrixMapper m2(2);
3344  TEST_EXPECT_EQUAL(m2.linear_size(), 1);
3345  TEST_EXPECTATION(does_map_index(m2, 0, 1, 0));
3346  }
3347  {
3348  SymmetricMatrixMapper m3(3);
3349  TEST_EXPECT_EQUAL(m3.linear_size(), 3);
3350  TEST_EXPECTATION(does_map_index(m3, 0, 1, 0));
3351  TEST_EXPECTATION(does_map_index(m3, 2, 0, 1));
3352  TEST_EXPECTATION(does_map_index(m3, 2, 1, 2));
3353  }
3354  {
3355  SymmetricMatrixMapper m100(100);
3356  TEST_EXPECT_EQUAL(m100.linear_size(), 4950);
3357  TEST_EXPECTATION(does_map_index(m100, 0, 1, 0));
3358  TEST_EXPECTATION(does_map_index(m100, 49, 50, 1274));
3359  TEST_EXPECTATION(does_map_index(m100, 51, 50, 1274+51));
3360  TEST_EXPECTATION(does_map_index(m100, 99, 98, 4949));
3361  }
3362 }
3363 
3364 void TEST_group_duplicate_detection() {
3365  GB_shell shell;
3366  GBDATA *gb_main = GB_open("../../demo.arb", "r");
3367 
3368  GroupSearchCallback traceRefresh_cb = makeGroupSearchCallback(trace_refresh_cb);
3369 
3370  {
3371  refreshes_traced = 0;
3372 
3373  GroupSearch search(gb_main, traceRefresh_cb);
3374  search.addSortCriterion(GSC_NAME);
3375  search.addSortCriterion(GSC_TREENAME);
3376 
3377  search.setDupCriteria(true, DNC_WHOLENAME, GB_MIND_CASE, DLC_SAME_TREE, 2);
3378  search.perform_search(GSM_FIND);
3379  TEST_EXPECTATION(hasOrder(search, "TN")); // treename, groupname
3380  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3381  "1/outer/tree_test*"
3382  "1/outer/tree_test*"
3383  "2/test/tree_test*"
3384  "2/test/tree_test*"
3385  "3/outer/tree_tree2*"
3386  "3/outer/tree_tree2*"
3387  "4/test/tree_tree2*"
3388  "4/test/tree_tree2"
3389  ));
3390 
3391  search.addSortCriterion(GSC_REVERSE);
3392  search.addSortCriterion(GSC_CLUSTER);
3393  search.addSortCriterion(GSC_REVERSE);
3394 
3395  search.setDupCriteria(true, DNC_WHOLENAME, GB_MIND_CASE, DLC_ANYWHERE, 2);
3396  search.perform_search(GSM_FIND);
3397  TEST_EXPECTATION(hasOrder(search, "!C!TN")); // cluster(rev), treename, groupname
3398  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3399  "5/g2/tree_tree2*"
3400  "5/g2/tree_zomb*"
3401  "4/xx/tree_test*"
3402  "4/xx/tree_tree2*"
3403  "4/xx/tree_zomb*"
3404  "3/test/tree_test*"
3405  "3/test/tree_test*"
3406  "3/test/tree_tree2*"
3407  "3/test/tree_tree2*"
3408  "2/inner/tree_test*"
3409  "2/inner/tree_tree2*"
3410  "1/outer/tree_test*"
3411  "1/outer/tree_test*"
3412  "1/outer/tree_tree2*"
3413  "1/outer/tree_tree2"
3414  ));
3415 
3416  search.setDupCriteria(false, DNC_WHOLENAME, GB_MIND_CASE, DLC_ANYWHERE, 2); // search "unique" groups
3417  search.perform_search(GSM_FIND);
3418  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3419  "0/another group/tree_test*"
3420  "0/last/tree_test*"
3421  "0/ZOMB/tree_zomb*"
3422  "0/dup/tree_zomb*"
3423  "0/eee/tree_zomb*"
3424  "0/g3/tree_zomb*"
3425  "0/g4/tree_zomb*"
3426  "0/inner group/tree_zomb*"
3427  "0/inner outer group/tree_zomb*"
3428  "0/outer group/tree_zomb*"
3429  "0/yy/tree_zomb*"
3430  "0/zomb/tree_zomb*"
3431  "0/zombsub/tree_zomb"
3432  ));
3433 
3434  search.addSortCriterion(GSC_NAME);
3435  search.addSortCriterion(GSC_TREENAME);
3436  search.addSortCriterion(GSC_CLUSTER);
3437 
3438  search.setDupCriteria(true, DNC_WHOLENAME, GB_MIND_CASE, DLC_DIFF_TREE, 2);
3439  search.perform_search(GSM_FIND);
3440  TEST_EXPECTATION(hasOrder(search, "CTN")); // cluster, treename, groupname
3441  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3442  "1/outer/tree_test*"
3443  "1/outer/tree_test*"
3444  "1/outer/tree_tree2*"
3445  "1/outer/tree_tree2*"
3446  "2/inner/tree_test*"
3447  "2/inner/tree_tree2*"
3448  "3/test/tree_test*"
3449  "3/test/tree_test*"
3450  "3/test/tree_tree2*"
3451  "3/test/tree_tree2*"
3452  "4/xx/tree_test*"
3453  "4/xx/tree_tree2*"
3454  "4/xx/tree_zomb*"
3455  "5/g2/tree_tree2*"
3456  "5/g2/tree_zomb"
3457  ));
3458 
3459  search.setDupCriteria(true, DNC_WHOLENAME, GB_MIND_CASE, DLC_DIFF_TREE, 3); // expect hits in 3 diff. trees
3460  search.perform_search(GSM_FIND);
3461  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, // Note: does not add 'outer' or 'test' (they occur 4 times, but only in 2 trees!)
3462  "1/xx/tree_test*"
3463  "1/xx/tree_tree2*"
3464  "1/xx/tree_zomb"
3465  ));
3466 
3467  // --------------------------------------------
3468  // test DNC_WORDWISE name comparison:
3469 
3470  const char *word_sep = " ";
3471  WordSet no_words_ignored;
3472  search.setDupCriteria(true, DNC_WORDWISE, GB_MIND_CASE, 1, no_words_ignored, word_sep, DLC_ANYWHERE, 2);
3473  search.perform_search(GSM_FIND);
3474  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3475  "1/another group/tree_test*"
3476  "1/inner group/tree_zomb*"
3477  "1/inner outer group/tree_zomb*"
3478  "1/outer group/tree_zomb*"
3479 
3480  "2/outer/tree_test*"
3481  "2/outer/tree_test*"
3482  "2/outer/tree_tree2*"
3483  "2/outer/tree_tree2*"
3484 
3485  "3/test/tree_test*"
3486  "3/test/tree_test*"
3487  "3/test/tree_tree2*"
3488  "3/test/tree_tree2*"
3489 
3490  "4/xx/tree_test*"
3491  "4/xx/tree_tree2*"
3492  "4/xx/tree_zomb*"
3493 
3494  "5/inner/tree_test*"
3495  "5/inner/tree_tree2*"
3496 
3497  "6/g2/tree_tree2*"
3498  "6/g2/tree_zomb"
3499  ));
3500 
3501  search.setDupCriteria(true, DNC_WORDWISE, GB_MIND_CASE, 2, no_words_ignored, word_sep, DLC_ANYWHERE, 2);
3502  search.perform_search(GSM_FIND);
3503  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3504  "1/inner group/tree_zomb*"
3505  "1/inner outer group/tree_zomb"
3506  ));
3507 
3508  // rename one group (spaces->commas) to test special word separators
3509  {
3510  GB_transaction ta(gb_main);
3511  TEST_EXPECT_NO_ERROR(search.rename_group(0, "/ /,/"));
3512  TEST_EXPECT_EQUAL(search.get_results()[0].get_name(), "inner,group");
3513  }
3514 
3515  search.setDupCriteria(true, DNC_WORDWISE, GB_MIND_CASE, 2, no_words_ignored, word_sep, DLC_ANYWHERE, 2);
3516  search.perform_search(GSM_FIND);
3517  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, // rename of group causes a change of detected cluster
3518  "1/inner outer group/tree_zomb*"
3519  "1/outer group/tree_zomb"
3520  ));
3521 
3522 
3523  word_sep = ", "; // <<<------------------------------ commas separate words from now on!
3524 
3525  search.setDupCriteria(true, DNC_WORDWISE, GB_MIND_CASE, 2, no_words_ignored, word_sep, DLC_ANYWHERE, 2);
3526  search.perform_search(GSM_FIND);
3527  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3528  "1/inner outer group/tree_zomb*"
3529  "1/inner,group/tree_zomb"
3530  ));
3531 
3532  WordSet ignore_group;
3533  ignore_group.insert("Group");
3534 
3535  search.setDupCriteria(true, DNC_WORDWISE, GB_IGNORE_CASE, 1, ignore_group, word_sep, DLC_ANYWHERE, 2);
3536  search.perform_search(GSM_FIND);
3537  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3538  "1/outer/tree_test*"
3539  "1/outer/tree_test*"
3540  "1/outer/tree_tree2*"
3541  "1/outer/tree_tree2*"
3542  "1/inner outer group/tree_zomb*"
3543  "1/outer group/tree_zomb*"
3544 
3545  "2/test/tree_test*"
3546  "2/test/tree_test*"
3547  "2/test/tree_tree2*"
3548  "2/test/tree_tree2*"
3549 
3550  "3/inner/tree_test*"
3551  "3/inner/tree_tree2*"
3552  "3/inner,group/tree_zomb*"
3553 
3554  "4/xx/tree_test*"
3555  "4/xx/tree_tree2*"
3556  "4/xx/tree_zomb*"
3557 
3558  "5/g2/tree_tree2*"
3559  "5/g2/tree_zomb*"
3560 
3561  "6/ZOMB/tree_zomb*"
3562  "6/zomb/tree_zomb"
3563  ));
3564 
3565  search.setDupCriteria(true, DNC_WORDWISE, GB_IGNORE_CASE, 2, ignore_group, word_sep, DLC_ANYWHERE, 2);
3566  search.perform_search(GSM_FIND);
3567  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, "")); // none
3568 
3569  search.setDupCriteria(true, DNC_WORDWISE, GB_IGNORE_CASE, 1, ignore_group, "", DLC_ANYWHERE, 2); // empty word separator -> uses whole names
3570  search.perform_search(GSM_FIND);
3571  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3572  "1/outer/tree_test*"
3573  "1/outer/tree_test*"
3574  "1/outer/tree_tree2*"
3575  "1/outer/tree_tree2*"
3576 
3577  "2/test/tree_test*"
3578  "2/test/tree_test*"
3579  "2/test/tree_tree2*"
3580  "2/test/tree_tree2*"
3581 
3582  "3/xx/tree_test*"
3583  "3/xx/tree_tree2*"
3584  "3/xx/tree_zomb*"
3585 
3586  "4/inner/tree_test*"
3587  "4/inner/tree_tree2*"
3588 
3589  "5/g2/tree_tree2*"
3590  "5/g2/tree_zomb*"
3591 
3592  "6/ZOMB/tree_zomb*"
3593  "6/zomb/tree_zomb"
3594  ));
3595 
3596  // rename more groups to test cluster-search based on 3 words and extension based on 2 words
3597  {
3598  GB_transaction ta(gb_main);
3599  TEST_EXPECT_NO_ERROR(search.rename_group(0, "/outer/group inner outer/"));
3600  TEST_EXPECT_NO_ERROR(search.rename_group(1, "/outer/group outer/"));
3601  TEST_EXPECT_NO_ERROR(search.rename_group(2, "/outer/outer group/"));
3602  TEST_EXPECT_EQUAL(search.get_results()[0].get_name(), "group inner outer");
3603  TEST_EXPECT_EQUAL(search.get_results()[1].get_name(), "group outer");
3604  TEST_EXPECT_EQUAL(search.get_results()[2].get_name(), "outer group");
3605  }
3606 
3607  search.setDupCriteria(true, DNC_WORDWISE, GB_IGNORE_CASE, 2, no_words_ignored, word_sep, DLC_ANYWHERE, 2);
3608  search.perform_search(GSM_FIND);
3609  TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT,
3610  "1/group inner outer/tree_test*" // cluster based on 3 words gets extended by groups matching 2 of these words ("group" and "outer")
3611  "1/group outer/tree_test*" // (note that group containing 'inner' and 'group' is discarded, because resulting cluster would be smaller)
3612  "1/outer group/tree_tree2*"
3613  "1/inner outer group/tree_zomb*"
3614  "1/outer group/tree_zomb"
3615  ));
3616 
3617  TEST_EXPECT_EQUAL(refreshes_traced, 2); // 2 renames
3618  }
3619  GB_close(gb_main);
3620 }
3621 
3622 static double bruteForce_calc_average_ingroup_distance(GroupSearchTree *node) {
3623  unsigned leafs = node->get_leaf_count();
3624 
3625  if (leafs == 1) return 0.0; // single leaf -> zero distance
3626 
3627  ARB_edge last = parentEdge(node->get_leftson());
3628  ARB_edge start = parentEdge(node->get_rightson()).inverse();
3629 
3630  if (start == last) {
3631  gs_assert(start.get_type() == ROOT_EDGE);
3632  start = start.next();
3633  }
3634 
3635  unsigned pairs = 0;
3636  double dist_sum = 0.0;
3637 
3638  for (ARB_edge e1 = start; e1 != last; e1 = e1.next()) {
3639  if (e1.is_edge_to_leaf()) {
3640  for (ARB_edge e2 = e1.next(); e2 != last; e2 = e2.next()) {
3641  if (e2.is_edge_to_leaf()) {
3642  dist_sum += e1.dest()->intree_distance_to(e2.dest());
3643  ++pairs;
3644  }
3645  }
3646  }
3647  }
3648 
3649 #if defined(ASSERTION_USED)
3650  const unsigned calc_pairs = (leafs*(leafs-1))/2;
3651  gs_assert(pairs == calc_pairs);
3652 #endif
3653 
3654  return dist_sum/pairs;
3655 }
3656 
// Test macro: expects that (node)->get_average_ingroup_distance() equals the value
// calculated by bruteForce_calc_average_ingroup_distance(node) within EPSILON (1e-6).
// Note: 'node' is expanded twice, so pass an expression without side effects.
3657 #define TEST_EXPECT_PROPER_AID(node) do{ \
3658  const double EPSILON = 0.000001; \
3659  TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(node), \
3660  (node)->get_average_ingroup_distance(), \
3661  EPSILON); \
3662  }while(0)
3663 
// Unit test for the average ingroup distance (AID):
// opens "TEST_trees.arb", loads 'tree_test' and
//  1. checks the brute-force reference calculation against hard-coded values,
//  2. checks get_average_ingroup_distance() against the brute-force reference
//     for some selected nodes and for the destination node of each edge visited
//     by a full walk starting behind the root edge.
3664 void TEST_ingroup_distance() {
3665  GB_shell shell;
3666  GBDATA *gb_main = GB_open("TEST_trees.arb", "r");
3667 
3668  {
3669  GB_transaction ta(gb_main);
3670  SearchedTree stree("tree_test", gb_main);
3671 
3672  GroupSearchRoot *troot = stree.get_tree_root();
3673  TEST_REJECT(stree.failed_to_load());
3674 
3675  // get some specific nodes:
3676  GroupSearchTree *rootNode = troot->get_root_node();
3677  GroupSearchTree *leftSon = rootNode->get_leftson();
3678  GroupSearchTree *grandSon = leftSon->get_rightson();
3679 
3680  GroupSearchTree *someLeaf = grandSon->get_leftson();
3681  while (!someLeaf->is_leaf()) { // descend into the bigger subtree => ends below a subtree containing 2 leafs (verified below)
3682  GroupSearchTree *L = someLeaf->get_leftson();
3683  GroupSearchTree *R = someLeaf->get_rightson();
3684 
3685  someLeaf = L->get_leaf_count() > R->get_leaf_count() ? L : R; // on equal size the right son is taken
3686  }
3687 
3688  TEST_EXPECT_EQUAL(someLeaf->get_leaf_count(), 1);
3689 
3690  GroupSearchTree *minSubtree = someLeaf->get_father();
3691  TEST_EXPECT_EQUAL(minSubtree->get_leaf_count(), 2);
3692 
3693  // brute-force AID calculation:
3694  // (the numeric constants below are presumably values recorded for TEST_trees.arb -- TODO confirm)
3695  {
3696  const double EPSILON = 0.000001;
3697  TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(someLeaf), 0.0, EPSILON);
3698  TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(minSubtree), minSubtree->leftlen + minSubtree->rightlen, EPSILON); // 2 leafs -> one pair -> sum of both branch lengths
3699  TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(grandSon), 0.534927, EPSILON);
3700  TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(leftSon), 0.976091, EPSILON);
3701  TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(rootNode), 1.108438, EPSILON);
3702  }
3703 
3704  // calculate AID on-the-fly and compare with brute-force results
3705  TEST_EXPECT_PROPER_AID(someLeaf);
3706  TEST_EXPECT_PROPER_AID(minSubtree);
3707  TEST_EXPECT_PROPER_AID(grandSon);
3708  TEST_EXPECT_PROPER_AID(leftSon);
3709  TEST_EXPECT_PROPER_AID(rootNode);
3710 
3711  // same comparison for the destination node of each edge of a complete walk
3712  // (starts with the edge following the root edge and stops when the root edge is reached again)
3713  ARB_edge start = rootEdge(troot);
3714  for (ARB_edge e = start.next(); e != start; e = e.next()) {
3715  TEST_EXPECT_PROPER_AID(DOWNCAST(GroupSearchTree*, e.dest()));
3716  }
3717  }
3718  GB_close(gb_main);
3719 }
3717 
3718 #endif // UNIT_TESTS
3719 
3720 // --------------------------------------------------------------------------------
3721 
DupTreeCriterionType
Definition: group_search.h:303
const char * get_tree_name() const
void compute_tree() OVERRIDE
const char * GB_ERROR
Definition: arb_core.h:25
bool big_enough(const GroupCluster &cluster) const
static GB_ERROR grl_hitcount(GBL_command_arguments *args)
string result
FoundGroup & get_group()
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GB_TYPES type
const std::string & get_hit_reason() const
Definition: group_search.h:190
const char * get_group_display(const FoundGroup &g, bool show_tree_name) const
GroupSearchRoot * get_tree_root()
void inc_to_avoid_overflow(PINT x)
Definition: arb_progress.h:363
compare_by_criteria(const SortCriteria &by_)
void put(const char *elem)
Definition: arb_strarray.h:188
void forgetSortCriteria()
Definition: group_search.h:366
GroupClusterCIter begin() const
std::set< std::string > SpeciesNames
Definition: group_search.h:232
group_matcher all()
Definition: test_unit.h:1011
bool group_is_folded(GBDATA *gb_group)
int get_marked_pc() const
Definition: group_search.h:214
GBDATA * get_parent_group(GBDATA *gb_group) const
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
unsigned get_leaf_count() const FINAL_OVERRIDE
#define TRIGGER_UPDATE_GROUP_RESULTS
Lazy< int,-1 > nesting
Definition: group_search.h:166
~GroupSearchRoot() FINAL_OVERRIDE
#define TEST_EXPECT_SIMILAR(expr, want, epsilon)
Definition: test_unit.h:1298
const SortCriteria & by
Definition: arbdb.h:65
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:729
bool empty() const
GBDATA * GB_child(GBDATA *father)
Definition: adquery.cxx:322
GB_ERROR GB_add_hierarchy_callback(GBDATA *gb_main, const char *db_path, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:432
#define implicated(hypothesis, conclusion)
Definition: arb_assert.h:289
static char * y[maxsp+1]
return string(buffer, length)
bool overlap_is_folded() const
GB_ERROR delete_group(size_t idx)
GBDATA * get_tree_data()
bool empty() const
Definition: group_search.h:266
const WordSet & get_ignored_words() const
NestingLevelKey(const GroupSearch &group_search_)
static void collect_searched_trees(GBDATA *gb_main, const TreeNameSet &trees_to_search, SearchedTreeContainer &searched_tree)
bool has_group_info() const
Definition: TreeNode.h:444
Definition: AP_filter.hxx:36
void addSortCriterion(GroupSortCriterion gsc)
GB_ERROR GB_add_callback(GBDATA *gbd, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:356
#define DOWNCAST_REFERENCE(totype, expr)
Definition: downcast.h:152
GroupSearchCommon * common
static void result_update_cb(GBDATA *, GroupSearchCommon *common)
void string_to_lower(string &s)
GB_ERROR delete_from_DB()
#define DEFINE_TREE_RELATIVES_ACCESSORS(TreeType)
Definition: TreeNode.h:613
void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size)
int get_tree_order() const
const FoundGroup * get_hit_group() const
GB_ERROR delete_found_groups()
match_expectation doesnt_report_error(const char *error)
Definition: test_unit.h:1105
const FoundGroup & get_group() const
GBDATA * GB_nextEntry(GBDATA *entry)
Definition: adquery.cxx:339
#define DEFINE_TREE_ROOT_ACCESSORS(RootType, TreeType)
Definition: TreeNode.h:610
const char * get_load_error() const
long
Definition: AW_awar.cxx:152
unsigned get_marked_count() const
bool contains_changed(GroupSearchCommon *common) const
void add(int v)
Definition: ClustalV.cxx:461
void inc_to(PINT x)
Definition: arb_progress.h:362
void find_and_deliverTo(QueriedGroups &toResult)
void buildInferableClusterStartingWith(int start_idx, GroupCluster &cluster)
ARB_edge_type get_type() const
Definition: TreeNode.h:766
const char * get_name() const OVERRIDE
ARB_ERROR set_marks_in_found_groups(GroupMarkMode mode, CollectMode cmode)
void allow_lookup() const
double get_average_ingroup_distance() const
TreeNode * GBT_read_tree(GBDATA *gb_main, const char *tree_name, TreeRoot *troot)
Definition: adtree.cxx:837
const char * get_name() const OVERRIDE
void inform_group(const GroupSearch &group_search, const string &hitReason)
bool isCorrectParent(TreeNode *node, GBDATA *gb_group, GBDATA *gb_parent_group)
static GB_ERROR grl_nesting(GBL_command_arguments *args)
int calc_nesting_level(GBDATA *gb_group) const
ARB_edge inverse() const
Definition: TreeNode.h:794
bool tree_matches(const GBDATA *data1, const GBDATA *data2) const
double get_average_ingroup_distance() const
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
const char * get_name() const OVERRIDE
void erase()
Definition: arb_strbuf.h:141
STL namespace.
void insert(int i)
int get_leaf_count() const
bool is_folded() const
SmartPtr< GroupSearchRoot > GroupSearchRootPtr
GroupSortCriterion
Definition: group_search.h:237
void cat(const char *from)
Definition: arb_strbuf.h:199
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248
GroupRename_callenv(const QueriedGroups &queried_, int hit_idx_, const GBL_env &env_)
bool has_been_deleted(GBDATA *gb_node)
was_modified(GroupSearchCommon *common_)
void refresh_results_after_DBchanges()
const TreeNode * find_parent_with_groupInfo(bool skipKeeledBrothers=false) const
Definition: TreeNode.h:493
ARB_edge rootEdge(TreeRoot *root)
Definition: TreeNode.h:898
void findBestClusterBasedOnWords(int wanted_words, GroupCluster &best, arb_progress &progress_cluster, int &first_cluster_found_from_index)
bool already_delivered(int idx) const
CollectMode
Definition: group_search.h:72
GroupCluster(const GroupCluster &other)
static void set_marks_of(const SpeciesNames &targetSpecies, GBDATA *gb_main, GroupMarkMode mode)
#define ARRAY_ELEMS(array)
Definition: arb_defs.h:19
int name_matches(const GroupInfo &gi1, const GroupInfo &gi2) const
void setNull()
set SmartPtr to NULp
Definition: smartptr.h:251
Lazy< int,-1 > keeled
Definition: group_search.h:172
int max_cluster_start_index() const
GBDATA * GB_get_father(GBDATA *gbd)
Definition: arbdb.cxx:1722
const char * get_name() const OVERRIDE
GBDATA * get_gb_main() const
Definition: group_search.h:357
const GBL_call_env & get_callEnv() const
Definition: gb_aci.h:234
std::set< std::string > WordSet
Definition: group_search.h:309
int linear_index(int x, int y) const
GroupSearchTree(GroupSearchRoot *root)
#define DOWNCAST(totype, expr)
Definition: downcast.h:141
#define FINAL_OVERRIDE
Definition: cxxforward.h:114
ARB_ERROR fold_found_groups(GroupFoldingMode mode)
DupNameCriterionType get_name_type() const
GB_ERROR check_no_parameter(GBL_command_arguments *args)
Definition: gb_aci_impl.h:152
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
int GB_string_comparator(const void *v0, const void *v1, void *)
Definition: arb_sort.cxx:47
ARB_edge next() const
Definition: TreeNode.h:804
static HelixNrInfo * start
ARB_edge parentEdge(TreeNode *son)
Definition: TreeNode.h:883
Lazy< int,-1 > marked
Definition: group_search.h:168
POS_TREE1 * father
Definition: probe_tree.h:39
unsigned long permutations(int elems)
GroupInfo(const FoundGroup &g, bool prep_wordwise, GB_CASE sens, const char *wordSeparators, const WordSet &ignored_words)
const double EPSILON
Definition: aw_position.hxx:73
GroupClusterCIter end() const
Lazy< int,-1 > size
Definition: group_search.h:167
GroupCluster(int num_of_groups)
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:916
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
int get_keeledStateInfo() const
ARB_ERROR rename_by_ACI(const char *acisrt, const QueriedGroups &results, int hit_idx)
#define TEST_EXPECT(cond)
Definition: test_unit.h:1328
GroupSearchMode
Definition: group_search.h:320
static void set_species_data(GBDATA *gb_species_data_)
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
DupCriteria(bool listDups_, const DupNameCriterion &nameCrit_, DupTreeCriterionType ttype_, int minSize_)
std::set< GBDATA * > GBDATAset
Definition: group_search.h:229
const GroupSearchTree * get_clade() const
bool erase_deleted(GroupSearchCommon *common)
bool needs_eval() const
Definition: lazy.h:61
GB_CSTR GB_read_key_pntr(GBDATA *gbd)
Definition: arbdb.cxx:1656
const char * get_word_separators() const
const char * get_name() const OVERRIDE
bool isSet() const
test if SmartPtr is not NULp
Definition: smartptr.h:245
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1781
bool empty() const
GB_CASE get_sensitivity() const
GBDATA * gb_species_data
Definition: adname.cxx:33
bool aborted()
Definition: arb_progress.h:335
bool knows_details() const
Definition: group_search.h:199
list< Candidate > CandidateList
void addQueryExpression(CriterionOperator op, CriterionType type, CriterionMatch mtype, const char *expression)
static int group[MAXN+1]
Definition: ClustalV.cxx:65
vector< GroupInfo > GroupInfoVec
#define false
Definition: ureadseq.h:13
DupNameCriterionType
Definition: group_search.h:297
void forgetDupCriteria()
bool has_results() const
Definition: group_search.h:392
char * GBS_trim(const char *str)
Definition: adstring.cxx:947
GB_ERROR GBT_write_group_name(GBDATA *gb_group_name, const char *new_group_name, bool pedantic)
Definition: adtree.cxx:230
void deliverRest(QueriedGroups &toResult)
#define COMMAND_DROPS_INPUT_STREAMS(args)
Definition: gb_aci_impl.h:218
const char * GBS_readable_size(unsigned long long size, const char *unit_suffix)
Definition: arb_misc.cxx:23
LazyFloat< double > aid
Definition: group_search.h:170
#define TEST_REJECT(cond)
Definition: test_unit.h:1330
#define TEST_REJECT_NULL(n)
Definition: test_unit.h:1325
const QueriedGroups & queried
static void error(const char *msg)
Definition: mkptypes.cxx:96
std::set< std::string > TreeNameSet
Definition: group_search.h:318
unsigned get_group_size() const
GBDATA * GB_get_root(GBDATA *gbd)
Definition: arbdb.cxx:1740
GroupSearch(GBDATA *gb_main_, const GroupSearchCallback &redisplay_results_cb)
static void string2WordSet(const char *name, WordSet &words, const char *wordSeparators, const WordSet &ignored_words)
bool contains(int i) const
bool tree_is_loaded() const
query_key_type
Definition: query_expr.h:81
bool operator()(const FoundGroup &g)
ARB_ERROR rename_group(size_t idx, const char *acisrt)
int get_nesting() const
Definition: group_search.h:211
expectation_group & add(const expectation &e)
Definition: test_unit.h:812
static GB_ERROR grl_dupidx(GBL_command_arguments *args)
~TargetGroup() OVERRIDE
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
has_been_deleted(GroupSearchCommon *common_)
size_t get_word_count() const
ASSERTING_CONSTEXPR_INLINE int info2bio(int infopos)
Definition: arb_defs.h:27
bool is_keeled_group() const
Definition: TreeNode.h:475
#define that(thing)
Definition: test_unit.h:1043
void track_max_widths(ColumnWidths &widths) const
const char * get_name() const
DupNameCriterion(DupNameCriterionType exact, GB_CASE sens_)
bool has_been_modified(GBDATA *gb_node)
set< int > GroupClusterSet
void notify_modified(GBDATA *gb_node)
int get_keeled() const
Definition: group_search.h:215
bool iterate() const OVERRIDE
FoundGroupCIter end() const
Definition: group_search.h:277
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
void deliverCluster(const GroupCluster &ofCluster, QueriedGroups &toResult)
GroupSearchTree * get_clade()
ARB_ERROR set_marks_in_group(size_t idx, GroupMarkMode mode)
bool wordwise_name_matching() const
#define cmp(h1, h2)
Definition: admap.cxx:50
GroupClusterSet::const_iterator GroupClusterCIter
GBQUARK GB_get_quark(GBDATA *gbd)
Definition: arbdb.cxx:1703
void fix_deleted_groups(const GBDATAset &deleted_groups)
int GB_read_flag(GBDATA *gbd)
Definition: arbdb.cxx:2796
GBDATA * GBT_find_species_rel_species_data(GBDATA *gb_species_data, const char *name)
Definition: aditem.cxx:133
void forgetQExpressions()
AP_tree_nlen * rootNode()
Definition: ap_main.hxx:54
bool contains(const WordSet &ws, const string &w)
void remove_hit(size_t idx)
void track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled)
Candidate(GBDATA *gb_group_, GroupSearchTree *node_)
void remove(GroupSearch *gs)
RefPtr< GBDATA > tree
ARB_ERROR group_set_folded(GBDATA *gb_group, bool folded)
static int max2width(const int &i)
Definition: group_search.h:131
char * GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups &queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR &error)
Definition: lazy.h:26
static GBL_command_definition groupRename_command_table[]
void sort_by(const SortCriteria &by)
GB_CASE
Definition: arb_core.h:30
Candidate(const FoundGroup &group_, GroupSearchTree *node_)
void append(QueryExpr *&tail)
Definition: query_expr.cxx:46
#define is_equal_to(val)
Definition: test_unit.h:1025
void erase(int i)
double get_aid() const
Definition: group_search.h:216
void add_informed_group(const FoundGroup &group)
Definition: group_search.h:268
group_matcher none()
Definition: test_unit.h:1012
FoundGroupContainer::const_iterator FoundGroupCIter
Definition: group_search.h:234
TYPE get_type() const
Definition: probe_tree.h:64
GB_ERROR inc_and_error_if_aborted()
Definition: arb_progress.h:327
static GB_ERROR grl_aid(GBL_command_arguments *args)
#define TEST_EXPECTATION(EXPCTN)
Definition: test_unit.h:1048
SearchedTreeContainer::iterator SearchedTreeIter
const FoundGroup & get_group() const
TreeNode * dest() const
Definition: TreeNode.h:768
int get_edge_iteration_count() const
char * GBT_join_strings(const CharPtrArray &strings, char separator)
const GroupRename_callenv & custom_gr_env(GBL_command_arguments *args)
static GB_ERROR grl_groupsize(GBL_command_arguments *args)
const char * get_name() const OVERRIDE
bool is_edge_to_leaf() const
Definition: TreeNode.h:864
int get_cluster_id() const
Definition: group_search.h:194
void set_cluster_id(int id)
Definition: group_search.h:192
bool is_leaf() const
Definition: TreeNode.h:211
GB_ERROR GB_remove_hierarchy_callback(GBDATA *gb_main, const char *db_path, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:440
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
DupTreeCriterionType get_tree_type() const
xml element
GB_ERROR close(GB_ERROR error)
Definition: arbdbpp.cxx:35
void GB_write_flag(GBDATA *gbd, long flag)
Definition: arbdb.cxx:2773
int min_cluster_size() const
#define FORMAT_2_OUT(args, fmt, value)
Definition: gb_aci_impl.h:24
bool operator()(const FoundGroup &g)
GBDATA * get_tree_data() const
bool is_inferable() const
void flush_loaded_tree()
#define TEST_EXPECTATION__BROKEN(WANTED, GOT)
Definition: test_unit.h:1051
FoundGroupContainer::iterator FoundGroupIter
Definition: group_search.h:235
#define OVERRIDE
Definition: cxxforward.h:112
static void tree_node_deleted_cb(GBDATA *gb_node, GroupSearchCommon *common, GB_CB_TYPE cbtype)
void GB_touch(GBDATA *gbd)
Definition: arbdb.cxx:2802
#define gs_assert(cond)
Definition: group_search.h:48
bool needs_eval() const
Definition: lazy.h:37
GBQUARK GB_find_existing_quark(GBDATA *gbd, const char *key)
Definition: arbdb.cxx:1690
Clusterer(GBDATA *gb_main, SmartPtr< QueriedGroups > groups_, SmartPtr< DupCriteria > criteria_)
char * name
Definition: TreeNode.h:174
void nprintf(size_t maxlen, const char *templat,...) __ATTR__FORMAT_MEMBER(2)
Definition: arb_strbuf.cxx:29
int GB_read_byte(GBDATA *gbd)
Definition: arbdb.cxx:734
bool matches(const QueryTarget &target, std::string &hit_reason) const
Definition: query_expr.cxx:277
void forget_lookup() const
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
static GB_ERROR grl_markedingroup(GBL_command_arguments *args)
GBDATA * lookupParent(GBDATA *gb_child_group) const
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
GBDATA * get_ACI_item() const
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:909
GB_ERROR GB_write_byte(GBDATA *gbd, int i)
Definition: arbdb.cxx:1238
int name_matches_wordwise(const GroupInfo &gi1, const GroupInfo &gi2) const
bool want_unique_groups() const
CriterionMatch
Definition: group_search.h:82
void GB_remove_callback(GBDATA *gbd, GB_CB_TYPE type, const DatabaseCallback &dbcb)
Definition: ad_cb.cxx:360
~ParentGroupNameQueryKey() OVERRIDE
FoundGroupCIter begin() const
Definition: group_search.h:275
GBDATA * GBT_first_species(GBDATA *gb_main)
Definition: aditem.cxx:124
void GBT_get_tree_names(ConstStrArray &names, GBDATA *gb_main, bool sorted)
Definition: adtree.cxx:1187
void GBT_message(GBDATA *gb_main, const char *msg)
Definition: adtools.cxx:238
query_operator
Definition: query_expr.h:64
std::list< GroupSortCriterion > SortCriteria
Definition: group_search.h:253
unsigned get_marked_count() const
void negate()
Definition: query_expr.cxx:56
const char * get_name() const
#define TEST_EXPECT_NO_ERROR(call)
Definition: test_unit.h:1118
const char * get_group_name() const
GBDATA * get_pointer() const
Definition: group_search.h:184
int get_keeledStateInfo() const
DECLARE_ASSIGNMENT_OPERATOR(GroupCluster)
const ColumnWidths & get_column_widths() const
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
const char * get_name() const OVERRIDE
CriterionType
Definition: group_search.h:86
DupNameCriterion(DupNameCriterionType wordwise, GB_CASE sens_, int min_words_, const WordSet &ignored_words_, const char *wordSeparators_)
void sort(CharPtrArray_compare_fun compare, void *client_data)
int get_marked() const
Definition: group_search.h:213
ARB_ERROR fold_group(size_t idx, GroupFoldingMode mode)
#define KEELED_INDICATOR
Definition: TreeNode.h:168
bool is_inner_edge() const
Definition: TreeNode.h:872
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:116
static const GBL_command_lookup_table & get_GroupRename_customized_ACI_commands()
void add(GroupSearch *gs)
size_t size() const
Definition: group_search.h:265
bool is_leaf() const
Definition: probe_tree.h:67
#define TEST_EXPECT_ERROR_CONTAINS(call, part)
Definition: test_unit.h:1114
vector< SearchedTree > SearchedTreeContainer
GroupFoldingMode
Definition: group_search.h:52
void add_candidate(const GroupSearch &group_search, Candidate &cand, const std::string &hit_reason)
const char * get_data() const
Definition: arb_strbuf.h:120
int get_min_wanted_words() const
RefPtr< GBDATA > gb_overlap_group
Definition: group_search.h:173
NOT4PERL char * GB_command_interpreter_in_env(const char *str, const char *commands, const GBL_call_env &callEnv)
Definition: gb_aci.cxx:361
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
void inc_by(PINT count)
Definition: arb_progress.h:361
GBDATA * GB_nextChild(GBDATA *child)
Definition: adquery.cxx:326
void notify_deleted(GBDATA *gb_node)
const char * get_name() const OVERRIDE
ParentCache & get_parent_cache()
TreeNode * keelTarget()
Definition: TreeNode.h:448
GBDATA * GBT_find_tree(GBDATA *gb_main, const char *tree_name)
Definition: adtree.cxx:993
GB_transaction ta(gb_var)
int calc_max_used_words(bool ignore_delivered)
const QueriedGroups & get_results()
void reset() const OVERRIDE
SymmetricMatrixMapper(int elements)
static void group_name_changed_cb(GBDATA *gb_group_name, GroupSearchCommon *common)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_node
Definition: TreeNode.h:173
GBDATA * gb_main
Definition: adname.cxx:32
TargetGroup(GBDATA *gb_main_, const char *treename_)
void forget_results()
Definition: group_search.h:394
ParentGroupNameQueryKey(const GroupSearch &group_search_, CriterionType ctype)
bool operator()(const FoundGroup &g1, const FoundGroup &g2) const
void defineParentOf(GBDATA *gb_child_group, GBDATA *gb_parent_group)
ARB_ERROR rename_found_groups(const char *acisrt)
GroupSearchCommon * common
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
void aimTo(const Candidate &c)
std::string hit_reason
Definition: group_search.h:165
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:459
size_t length
GB_CB_TYPE
Definition: arbdb_base.h:46
ARB_ERROR change_folding(GroupFoldingMode mode)
int get_size() const
Definition: group_search.h:212
void perform_search(GroupSearchMode mode)
#define min(a, b)
Definition: f2c.h:153
char * get_target_data(const QueryTarget &target, GB_ERROR &) const OVERRIDE
static int info[maxsites+1]
CONSTEXPR_INLINE int double_cmp(const double d1, const double d2)
Definition: arbtools.h:185
void GBT_splitNdestroy_string(ConstStrArray &names, char *&namelist, const char *separator, SplitMode mode)
static GB_ERROR grl_hitidx(GBL_command_arguments *args)
const GroupSearchTree * get_clade() const
GroupMarkMode
Definition: group_search.h:66
void set_min_wanted_words(int words)
unsigned get_zombie_count() const
SearchedTree(const char *name_, GBDATA *gb_main)
const char * get_name() const OVERRIDE
GroupMarkedKey(bool percent_)
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1294
const GBL_command_lookup_table & ACI_get_standard_commands()
Definition: adlang1.cxx:2749
SmartPtr< WordSet > words
bool failed_to_load() const
bool is_normal_group() const
Definition: TreeNode.h:470
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
bool legal_hit_index() const
li
Definition: AW_awar.cxx:152
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
void GB_close(GBDATA *gbd)
Definition: arbdb.cxx:655
TreeNode * source() const
Definition: TreeNode.h:767
unsigned get_zombie_count() const
CriterionOperator
Definition: group_search.h:77
size_t size() const
int get_dupidx(GB_ERROR &error) const
void put(char c)
Definition: arb_strbuf.h:174
static int iteration_count(int leafs_in_tree)
Definition: TreeNode.h:850
Definition: cache.h:31
#define UNCOVERED()
Definition: arb_assert.h:380
Definition: arbdb.h:66
GBDATA * GBT_get_species_data(GBDATA *gb_main)
Definition: aditem.cxx:105
GB_write_int const char s
Definition: AW_awar.cxx:154
#define max(a, b)
Definition: f2c.h:154