ARB
group_search.h
Go to the documentation of this file.
1 // ============================================================= //
2 // //
3 // File : group_search.h //
4 // Purpose : public interface of group search //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in April 2017 //
7 // http://www.arb-home.de/ //
8 // //
9 // ============================================================= //
10 
11 #ifndef GROUP_SEARCH_H
12 #define GROUP_SEARCH_H
13 
14 #ifndef LAZY_H
15 #include <lazy.h>
16 #endif
17 #ifndef QUERY_EXPR_H
18 #include <query_expr.h>
19 #endif
20 #ifndef ARBDB_BASE_H
21 #include <arbdb_base.h>
22 #endif
23 #ifndef ARBTOOLS_H
24 #include <arbtools.h>
25 #endif
26 #ifndef ARB_ERROR_H
27 #include <arb_error.h>
28 #endif
29 #ifndef SMARTPTR_H
30 #include <smartptr.h>
31 #endif
32 #ifndef CB_H
33 #include <cb.h>
34 #endif
35 #ifndef _GLIBCXX_VECTOR
36 #include <vector>
37 #endif
38 #ifndef _GLIBCXX_SET
39 #include <set>
40 #endif
41 #ifndef _GLIBCXX_LIST
42 #include <list>
43 #endif
44 #ifndef _GLIBCXX_STRING
45 #include <string>
46 #endif
47 
48 #define gs_assert(cond) arb_assert(cond)
49 
50 class GroupSearchCommon;
51 
52 enum GroupFoldingMode { // bit mask
53  GFM_EXPAND = 1, // expand groups (otherwise collapse)
54  GFM_TOGGLE = 2, // toggle state of groups (if set -> bit0 is ignored)
55  GFM_RECURSE = 4, // apply to parents (use with GFM_EXPAND)
56  GFM_PARENTS_ONLY = 8, // only apply to parents, not to listed groups (use with GFM_EXPAND)
57  GFM_COLLAPSE_REST = 16, // collapse all other groups in each tree
58 
59  // convenience defs
64 };
65 
66 enum GroupMarkMode { // same values as ../TREEDISP/TreeCallbacks.cxx@MARK_MODE_LOWER_BITS
68  GMM_MARK = 1,
70 };
71 
73 
74 // ----------------------------------------
75 // query expression related types
76 
77 enum CriterionOperator { // @@@ rename
81 };
82 enum CriterionMatch { // @@@ rename
85 };
86 enum CriterionType { // @@@ rename
87  CT_NAME, // groupname (STRING)
88  CT_PARENT_DIRECT, // groupname of direct parent (STRING)
89  CT_PARENT_ANY, // groupname of any parent (STRING)
90  CT_PARENT_ALL, // groupname of all parents (STRING)
91  CT_NESTING_LEVEL, // group-nesting-level (INT)
92  CT_FOLDED, // folded (INT)
93  CT_SIZE, // size of group = number of species (INT)
94  CT_MARKED, // number of marked species (INT)
95  CT_MARKED_PC, // percentage of marked species (FLOAT)
96  CT_ZOMBIES, // number of zombies (INT)
97  CT_AID, // average ingroup distance
98  CT_KEELED, // keeled state (INT)
99 };
100 
101 // ------------------------
102 // search results
103 
104 struct ColumnWidths {
105  // Stores widths (or maxvalues for integers) of all
106  // FoundGroup-attributes that might get displayed in result list.
107 
108  int name; // max. width of name
109  int reason; // max. width of hit_reason
110 
111  int max_nesting; // max. value for nesting
112  int max_size; // max. value for size
113  int max_marked; // max. value for marked
114  int max_marked_pc; // max. value for marked%
115  int max_cluster_id; // max. value of cluster id
116  int max_aid; // max. value in front of dot
117 
118  bool seen_keeled; // any keeled group found?
119 
121  name(0),
122  reason(0),
123  max_nesting(0),
124  max_size(0),
125  max_marked(0),
126  max_marked_pc(0),
127  max_cluster_id(0),
128  seen_keeled(false)
129  {}
130 
131  static int max2width(const int& i) { // @@@ a bit expensive (better use specialized type for 'max_nesting' and other maximas)
132  // calculate column width from maximum
133  // (Note: fails if negative values are involved)
134  return strlen(GBS_global_string("%i", i));
135  }
136  static int percent(int part, int all) {
137  return int(part*100.0/all+0.5);
138  }
139 
140  void track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled);
141 };
142 
143 class QueriedGroups;
144 
145 class FoundGroup {
146  // stores/exports info about a group (as found by group search)
147  // - usable w/o tree loaded
148  // - (able to) know details (like nesting, number of marked species, ...)
149 
150  RefPtr<GBDATA> gb_group;
151 
152  ARB_ERROR set_folded(bool folded);
153  ARB_ERROR set_overlap_folded(bool folded);
154  int get_name_length() const;
155 
156  GBDATA *get_overlap_group() const {
157  gs_assert(keeled.has_value()); // means: pointer 'gb_overlap_group' is valid
158  return gb_overlap_group;
159  }
160 
161 protected:
162  // optional values (needed to be "informed")
163  // - set manually by derivate ('Candidate' in group_search.cxx@inform_group)
164  // - knows_details() returns true if everything is properly set
167  Lazy<int, -1> size;
168  Lazy<int, -1> marked;
169 
170  LazyFloat<double> aid; // average ingroup distance
171 
172  Lazy<int, -1> keeled;
173  RefPtr<GBDATA> gb_overlap_group; // valid if 'keeled.has_value()'; NULp -> no overlapping group
174 
176 
177 public:
178  explicit FoundGroup(GBDATA *gb_group_) :
179  gb_group(gb_group_),
180  gb_overlap_group(NULp),
181  clusterID(0)
182  {}
183 
184  GBDATA *get_pointer() const { return gb_group; }
185  GBDATA *get_tree_data() const;
186 
187  const char *get_name() const;
188  const char *get_tree_name() const;
189  int get_tree_order() const;
190  const std::string& get_hit_reason() const { return hit_reason; }
191 
192  void set_cluster_id(int id) { gs_assert(id>0); clusterID = id; }
193  void forget_cluster_id() { clusterID = 0; }
194  int get_cluster_id() const { return clusterID; }
195 
196  bool overlap_is_folded() const;
197  bool is_folded() const;
198 
199  bool knows_details() const { // returns true if group is "informed"
200  return
201  !hit_reason.empty() &&
202  nesting.has_value() &&
203  size.has_value() &&
204  marked.has_value() &&
205  keeled.has_value() &&
206  aid.has_value();
207  }
208  void track_max_widths(ColumnWidths& widths) const;
209 
210  // getters for "informed" groups:
211  int get_nesting() const { gs_assert(knows_details()); return nesting; }
212  int get_size() const { gs_assert(knows_details()); return size; }
213  int get_marked() const { gs_assert(knows_details()); return marked; }
214  int get_marked_pc() const { gs_assert(knows_details()); return ColumnWidths::percent(marked, size); }
215  int get_keeled() const { gs_assert(knows_details()); return keeled; }
216  double get_aid() const { gs_assert(knows_details()); return aid; }
217 
218  bool operator == (const FoundGroup& other) const {
219  return gb_group == other.gb_group;
220  }
221 
222  // modify DB:
224  ARB_ERROR rename_by_ACI(const char *acisrt, const QueriedGroups& results, int hit_idx);
226 };
227 
228 class Candidate; // a derivate of FoundGroup
230 
231 typedef std::set<GBDATA*> GBDATAset;
232 typedef std::set<std::string> SpeciesNames;
233 typedef std::vector<FoundGroup> FoundGroupContainer;
234 typedef FoundGroupContainer::const_iterator FoundGroupCIter;
235 typedef FoundGroupContainer::iterator FoundGroupIter;
236 
238  GSC_NONE, // special criterion (not allowed in SortCriteria)
239  GSC_REVERSE, // special criterion: modifies previous criterion; may occur multiple times
242  GSC_TREEORDER, // as defined in tree-admin
249  GSC_AID, // average ingroup distance
251 };
252 
253 typedef std::list<GroupSortCriterion> SortCriteria;
254 
256  FoundGroupContainer found;
257  RefPtr<const SortCriteria> sorted_by;
258 
259  mutable SmartPtr<ColumnWidths> widths;
260  void invalidate_widths() const { widths.setNull(); }
261 
262 public:
263  QueriedGroups() : sorted_by(NULp) {}
264 
265  size_t size() const { return found.size(); }
266  bool empty() const { return size() == 0; }
267 
269  gs_assert(group.knows_details());
270  found.push_back(group);
271  invalidate_widths();
272  }
273  void add_candidate(const GroupSearch& group_search, Candidate& cand, const std::string& hit_reason);
274 
275  FoundGroupCIter begin() const { return found.begin(); }
276  FoundGroupIter begin() { return found.begin(); }
277  FoundGroupCIter end() const { return found.end(); }
278  FoundGroupIter end() { return found.end(); }
279 
280  const FoundGroup& operator[](size_t idx) const { gs_assert(idx<size()); return found.at(idx); }
281  FoundGroup& operator[](size_t idx) { gs_assert(idx<size()); return found.at(idx); }
282 
283  bool erase_deleted(GroupSearchCommon *common);
284  bool contains_changed(GroupSearchCommon *common) const;
285 
286  const ColumnWidths& get_column_widths() const;
287  const char *get_group_display(const FoundGroup& g, bool show_tree_name) const;
288 
289  void sort_by(const SortCriteria& by);
290 
291  void remove_hit(size_t idx);
292 };
293 
294 // --------------------------
295 // DupCriteria
296 
298  DNC_WHOLENAME, // match of whole groupname
299  DNC_WORDWISE, // wordwise match
300 
301  // maybe add something more fuzzy?
302 };
304  DLC_SAME_TREE, // duplicates need to be in same tree
305  DLC_DIFF_TREE, // duplicates need to be in different trees (but: cluster will be extended with groups from same tree)
306  DLC_ANYWHERE, // no restriction
307 };
308 
309 typedef std::set<std::string> WordSet;
310 
311 class DupCriteria;
312 
313 // ---------------------
314 // GroupSearch
315 
316 DECLARE_CBTYPE_FVV_AND_BUILDERS(GroupSearchCallback, void, class GroupSearch*); // generates makeGroupSearchCallback
317 
318 typedef std::set<std::string> TreeNameSet;
319 
320 enum GroupSearchMode { // bit mask
321  GSM_FORGET_EXISTING = 1, // otherwise keep existing results
322  GSM_ADD = 2, // otherwise remove
323  GSM_MISMATCH = 4, // otherwise target matching
324 
325  // convenience defs
327  GSM_KEEP = 0,
328  GSM_REMOVE = GSM_KEEP|GSM_MISMATCH, // has to be XORed
330 };
331 
332 class GroupSearch : virtual Noncopyable {
333  GBDATA *gb_main;
334  SmartPtr<QueryExpr> query_expr;
335  TreeNameSet trees_to_search;
337  GroupSearchCallback redisplay_cb;
338 
339  SortCriteria order;
340  bool sortedByOrder;
341 
343 
344  static GroupSearchCommon *common;
345 
346  void sort_results();
347  GB_ERROR clusterDuplicates();
348 
349 #if defined(UNIT_TESTS)
350 public:
351 #endif
352  ARB_ERROR collectSpecies(const QueriedGroups& groups, CollectMode cmode, SpeciesNames& species);
353 public:
354  GroupSearch(GBDATA *gb_main_, const GroupSearchCallback& redisplay_results_cb);
355  ~GroupSearch();
356 
357  GBDATA *get_gb_main() const { return gb_main; }
358 
359  // -------------------------
360  // define search:
361 
362  void addQueryExpression(CriterionOperator op, CriterionType type, CriterionMatch mtype, const char *expression);
363  void forgetQExpressions();
364 
367  order.clear();
368  sortedByOrder = false;
369  }
370 
371  void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size);
372  void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const WordSet& ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size);
373  void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const char *ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size);
374  void forgetDupCriteria();
375 
376  void setSearchRange(const TreeNameSet& trees_to_search_) { // empty -> search all trees
377  trees_to_search = trees_to_search_;
378  }
379 
380  // ----------------
381  // search
382 
383  void perform_search(GroupSearchMode mode);
384 
385  GBDATA *get_parent_group(GBDATA *gb_group) const;
386  int calc_nesting_level(GBDATA *gb_group) const;
387 
388  // ------------------------
389  // handle results:
390 
391  const QueriedGroups& get_results();
392  bool has_results() const { return found.isSet() && !found->empty(); }
393  void remove_hit(size_t idx) { found->remove_hit(idx); }
394  void forget_results() { found = new QueriedGroups; }
395 
396  // ----------------------------
397  // actions on results:
398 
399  GB_ERROR delete_group(size_t idx);
401 
402  ARB_ERROR rename_group(size_t idx, const char *acisrt);
403  ARB_ERROR rename_found_groups(const char *acisrt);
404 
405  ARB_ERROR fold_group(size_t idx, GroupFoldingMode mode);
407 
408  ARB_ERROR set_marks_in_group(size_t idx, GroupMarkMode mode);
410 
411 #if defined(UNIT_TESTS)
412  // inspectors:
413  const SortCriteria& inspect_order() const { return order; }
414  static GroupSearchCommon *get_common() { return common; }
415 #endif
416 
417  // internal (dont use):
419 };
420 
421 char *GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups& queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR& error);
422 
423 
424 #else
425 #error group_search.h included twice
426 #endif // GROUP_SEARCH_H
DupTreeCriterionType
Definition: group_search.h:303
const char * get_tree_name() const
const char * GB_ERROR
Definition: arb_core.h:25
GB_TYPES type
const std::string & get_hit_reason() const
Definition: group_search.h:190
const char * get_group_display(const FoundGroup &g, bool show_tree_name) const
const char * id
Definition: AliAdmin.cxx:17
void forgetSortCriteria()
Definition: group_search.h:366
std::set< std::string > SpeciesNames
Definition: group_search.h:232
group_matcher all()
Definition: test_unit.h:1000
int get_marked_pc() const
Definition: group_search.h:214
GBDATA * get_parent_group(GBDATA *gb_group) const
Lazy< int,-1 > nesting
Definition: group_search.h:166
return string(buffer, length)
bool overlap_is_folded() const
GB_ERROR delete_group(size_t idx)
bool empty() const
Definition: group_search.h:266
void addSortCriterion(GroupSortCriterion gsc)
GB_ERROR delete_from_DB()
void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size)
int get_tree_order() const
GB_ERROR delete_found_groups()
bool contains_changed(GroupSearchCommon *common) const
ARB_ERROR set_marks_in_found_groups(GroupMarkMode mode, CollectMode cmode)
int calc_nesting_level(GBDATA *gb_group) const
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
void setSearchRange(const TreeNameSet &trees_to_search_)
Definition: group_search.h:376
bool is_folded() const
GroupSortCriterion
Definition: group_search.h:237
void refresh_results_after_DBchanges()
CollectMode
Definition: group_search.h:72
void setNull()
set SmartPtr to NULp
Definition: smartptr.h:251
Lazy< int,-1 > keeled
Definition: group_search.h:172
GBDATA * get_gb_main() const
Definition: group_search.h:357
std::set< std::string > WordSet
Definition: group_search.h:309
ARB_ERROR fold_found_groups(GroupFoldingMode mode)
FoundGroup(GBDATA *gb_group_)
Definition: group_search.h:178
Lazy< int,-1 > marked
Definition: group_search.h:168
Lazy< int,-1 > size
Definition: group_search.h:167
ARB_ERROR rename_by_ACI(const char *acisrt, const QueriedGroups &results, int hit_idx)
GroupSearchMode
Definition: group_search.h:320
const FoundGroup & operator[](size_t idx) const
Definition: group_search.h:280
std::set< GBDATA * > GBDATAset
Definition: group_search.h:229
bool erase_deleted(GroupSearchCommon *common)
bool isSet() const
test if SmartPtr is not NULp
Definition: smartptr.h:245
void remove_hit(size_t idx)
Definition: group_search.h:393
char * GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups &queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR &error)
bool knows_details() const
Definition: group_search.h:199
void addQueryExpression(CriterionOperator op, CriterionType type, CriterionMatch mtype, const char *expression)
FoundGroupIter begin()
Definition: group_search.h:276
static int group[MAXN+1]
Definition: ClustalV.cxx:65
#define false
Definition: ureadseq.h:13
DupNameCriterionType
Definition: group_search.h:297
void forgetDupCriteria()
bool has_results() const
Definition: group_search.h:392
LazyFloat< double > aid
Definition: group_search.h:170
static void error(const char *msg)
Definition: mkptypes.cxx:96
std::set< std::string > TreeNameSet
Definition: group_search.h:318
GroupSearch(GBDATA *gb_main_, const GroupSearchCallback &redisplay_results_cb)
ARB_ERROR rename_group(size_t idx, const char *acisrt)
int get_nesting() const
Definition: group_search.h:211
void track_max_widths(ColumnWidths &widths) const
int get_keeled() const
Definition: group_search.h:215
FoundGroupCIter end() const
Definition: group_search.h:277
ARB_ERROR set_marks_in_group(size_t idx, GroupMarkMode mode)
void forgetQExpressions()
void remove_hit(size_t idx)
void track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled)
bool operator==(const FoundGroup &other) const
Definition: group_search.h:218
static int max2width(const int &i)
Definition: group_search.h:131
Definition: lazy.h:26
void sort_by(const SortCriteria &by)
bool has_value() const
Definition: lazy.h:38
GB_CASE
Definition: arb_core.h:30
double get_aid() const
Definition: group_search.h:216
void add_informed_group(const FoundGroup &group)
Definition: group_search.h:268
FoundGroupContainer::const_iterator FoundGroupCIter
Definition: group_search.h:234
int get_cluster_id() const
Definition: group_search.h:194
void set_cluster_id(int id)
Definition: group_search.h:192
GBDATA * get_tree_data() const
FoundGroupContainer::iterator FoundGroupIter
Definition: group_search.h:235
DECLARE_CBTYPE_FVV_AND_BUILDERS(GroupSearchCallback, void, class GroupSearch *)
#define gs_assert(cond)
Definition: group_search.h:48
CriterionMatch
Definition: group_search.h:82
FoundGroupCIter begin() const
Definition: group_search.h:275
std::list< GroupSortCriterion > SortCriteria
Definition: group_search.h:253
const char * get_name() const
GBDATA * get_pointer() const
Definition: group_search.h:184
const ColumnWidths & get_column_widths() const
bool has_value() const
Definition: lazy.h:62
CriterionType
Definition: group_search.h:86
int get_marked() const
Definition: group_search.h:213
ARB_ERROR fold_group(size_t idx, GroupFoldingMode mode)
#define NULp
Definition: cxxforward.h:97
size_t size() const
Definition: group_search.h:265
FoundGroup & operator[](size_t idx)
Definition: group_search.h:281
GroupFoldingMode
Definition: group_search.h:52
void add_candidate(const GroupSearch &group_search, Candidate &cand, const std::string &hit_reason)
RefPtr< GBDATA > gb_overlap_group
Definition: group_search.h:173
void forget_cluster_id()
Definition: group_search.h:193
const QueriedGroups & get_results()
GBDATA * gb_main
Definition: adname.cxx:33
void forget_results()
Definition: group_search.h:394
ARB_ERROR rename_found_groups(const char *acisrt)
std::string hit_reason
Definition: group_search.h:165
ARB_ERROR change_folding(GroupFoldingMode mode)
int get_size() const
Definition: group_search.h:212
void perform_search(GroupSearchMode mode)
static int percent(int part, int all)
Definition: group_search.h:136
GroupMarkMode
Definition: group_search.h:66
std::vector< FoundGroup > FoundGroupContainer
Definition: group_search.h:233
FoundGroupIter end()
Definition: group_search.h:278
CriterionOperator
Definition: group_search.h:77