ARB
NT_species_set.h
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : NT_species_set.h //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #ifndef NT_SPECIES_SET_H
12 #define NT_SPECIES_SET_H
13 
14 #ifndef NT_TREE_CMP_H
15 #include "NT_tree_cmp.h"
16 #endif
17 #ifndef ARBTOOLS_H
18 #include <arbtools.h>
19 #endif
20 #ifndef AP_TREE_HXX
21 #include <AP_Tree.hxx>
22 #endif
23 
24 class RSpecSet;
25 class TSpecSet;
26 class arb_progress;
27 
28 // @@@ improve compare logic:
29 // - species sets (and bitstrings) should only contain species that occur in both trees
30 // - species that occur only in RSpecSet-tree shall be stored in RSpecSet (like done in TSpecSet::unfound_species_count)
31 // - a small penalty shall be assigned (as done for TSpecSet)
32 
// SpecSetRegistry: keeps the registered species sets (RSpecSet, stored in 'sets'),
// a species hash used by get_species_index(), and the scorer used for matching.
// Also defines the size of the bitstrings used by the species sets (derived from 'nspecies').
// NOTE(review): this listing skips source lines 61 and 70, so declarations may be
// missing from this view - verify against the original header.
33 class SpecSetRegistry : virtual Noncopyable {
34  long species_counter; // number of species added to hash
35  long nspecies; // species count from which the bitstring size is derived (see bitstring_bytes)
36  long nsets; // number of RSpecSet added to 'sets'
37 
38  RSpecSet **sets;
39  int set_bits[256]; // NOTE(review): presumably a lookup table indexed by byte value - confirm in implementation
40 
41  GroupMatchScorer scorer; // can be replaced via setScorer()
42  arb_progress *progress;
43  GB_HASH *species_hash; // contains [1..N]
44  unsigned char *tmp_bitstring;
45 
// result of leafs_2_innerNodes() for 'nspecies' leafs in the ROOTED model
// NOTE(review): presumably the capacity of 'sets' - confirm in implementation
46  int max_nsets() const { return leafs_2_innerNodes(nspecies, ROOTED); }
47 
48  void dump_bitstring(const char *tag, unsigned char *bs);
49 
50  void add(const char *species_name); // max nspecies
51  void add(RSpecSet *rset); // max 2 * nspecies
52 
53  double search_and_remember_best_match_and_log_errors(const TSpecSet *tset, FILE *log);
54 
55 #if defined(UNIT_TESTS)
56  friend void TEST_species_sets();
57 #endif
58 
59 public:
60  SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer& scorer_);
62  void finish(GB_ERROR& error); // call before destruction to retrieve errors
63 
// number of bytes needed to hold 'nspecies' bits (rounded up)
64  long bitstring_bytes() const { return (nspecies-1)/8 + 1; }
// bitstring_bytes() expressed in whole longs (rounded up)
65  long bitstring_longs() const { return (bitstring_bytes()-1)/sizeof(long) + 1; }
66 
// requests bitstring_longs()*sizeof(long) bytes, i.e. the byte size padded to whole longs
// NOTE(review): presumably zero-initialized and owned by the caller - confirm against ARB_calloc
67  unsigned char *allocate_bitstring() const { return ARB_calloc<unsigned char>(bitstring_longs()*sizeof(long)); }
68 
// looks up 'species_name' in species_hash and returns the stored value
// NOTE(review): stored values are [1..N], so 0 presumably means "unknown species" - confirm against GBS_read_hash
69  long get_species_index(const char *species_name) const { return GBS_read_hash(species_hash, species_name); }
71 
72  RSpecSet *search_best_match(const TSpecSet *tset, GroupPenalty& min_penalty);
73  TSpecSet *find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info);
74  GB_ERROR write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci);
75 
// replaces the scorer passed to the constructor
76  void setScorer(const GroupMatchScorer& newScorer) { scorer = newScorer; }
77 };
78 
79 
// SpecSet: common base of RSpecSet and TSpecSet.
// Couples a tree node ('set_node') with a 'bitstring' and a member count.
// Constructors and destructor are protected, so only derived classes can create or destroy it.
80 class SpecSet : virtual Noncopyable {
81 protected:
82  // SpecSet should only be used by derived classes
83 
84  int known_members; // number of registered members
85 
86  void init(AP_tree *nodei, const SpecSetRegistry& ssr);
87 
88  SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species..
89  SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const SpecSet *l, const SpecSet *r); // ..or from two subsets
90  ~SpecSet();
91 
92 public:
93  // @@@ make member private
94  unsigned char *bitstring; // NOTE(review): presumably obtained from SpecSetRegistry::allocate_bitstring() - confirm in implementation
95  AP_tree *set_node; // node in tree (from which SpecSet was initialized)
96 
// true only if 'set_node' is set and is a leaf; a NULL 'set_node' yields false
97  bool is_leaf_set() const { return set_node && set_node->is_leaf(); } // @@@ might be wrong for zombies
// read access to the protected member counter
98  int get_known_members() const { return known_members; }
99 };
100 
// RSpecSet: species set that gets registered in a SpecSetRegistry.
// Additionally remembers the best match found so far ('best_match')
// and the node of the other tree that produced it ('best_node').
101 class RSpecSet : public SpecSet { // derived from Noncopyable
102  // set registered in SpecSetRegistry
103  AP_tree *best_node; // node in other tree
104  GroupPenalty best_match; // result of matching 'this' versus TSpecSet of best_node
105 
106 public:
107  RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species..
108  RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const RSpecSet *l, const RSpecSet *r); // ..or from two subsets
109 
// Overwrites the stored best match and its node.
// Precondition (checked by nt_assert): the currently stored match is not better than 'match'.
110  void storeBetterMatch(const GroupPenalty& match, AP_tree *matched_node) {
111  // if 'this' was detected as best match for any TSpecSet of other (not registered) tree,
112  // -> store match in best_match + node of TSpecSet in best_node:
113 
114  nt_assert(!best_match.betterThan(match)); // avoid overwriting with worse match
115 
116  best_match = match;
117  best_node = matched_node;
118  }
119 
120  int size() const { return known_members; } // only contains known members by definition
// accessors for the values last stored by storeBetterMatch()
// NOTE(review): initial values (before any store) are set by the constructors, which are not visible here
121  const GroupPenalty& bestMatch() const { return best_match; }
122  AP_tree* matchedNode() const { return best_node; }
123 };
124 
// TSpecSet: species set that is tested against the sets of a SpecSetRegistry.
// In addition to the known members it counts species that are missing in the registry.
125 class TSpecSet : public SpecSet { // derived from Noncopyable
126  // set tested against sets in registry
127 
128  int unfound_species_count; // species missing in SpecSetRegistry
129 public:
130  TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species..
131  TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const TSpecSet *l, const TSpecSet *r); // ..or from two subsets
132 
// total size = known members + species missing in the registry
133  int size() const { return known_members + unfound_species_count; }
// number of species missing in the registry
134  int get_unknown_members() const { return unfound_species_count; }
135 };
136 
137 #else
138 #error NT_species_set.h included twice
139 #endif // NT_SPECIES_SET_H
Definition: arbdbt.h:48
int size() const
const char * GB_ERROR
Definition: arb_core.h:25
long bitstring_bytes() const
AP_tree * matchedNode() const
long
Definition: AW_awar.cxx:152
void storeBetterMatch(const GroupPenalty &match, AP_tree *matched_node)
CONSTEXPR_INLINE int leafs_2_innerNodes(int leafs, TreeModel model)
Definition: arbdbt.h:70
bool betterThan(const GroupPenalty &other) const
Definition: NT_tree_cmp.h:89
AP_tree * set_node
void finish(GB_ERROR &error)
const GroupPenalty & bestMatch() const
long bitstring_longs() const
bool is_leaf_set() const
long get_species_index(const char *species_name) const
TSpecSet * find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info)
static void error(const char *msg)
Definition: mkptypes.cxx:96
RSpecSet * registerTree(AP_tree *node)
int get_unknown_members() const
GroupsToTransfer
Definition: NT_tree_cmp.h:27
int get_known_members() const
TSpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
int size() const
#define nt_assert(cond)
Definition: NT_local.h:27
void setScorer(const GroupMatchScorer &newScorer)
bool is_leaf() const
Definition: TreeNode.h:211
int known_members
void init(AP_tree *nodei, const SpecSetRegistry &ssr)
RSpecSet * search_best_match(const TSpecSet *tset, GroupPenalty &min_penalty)
SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer &scorer_)
Definition: NT_tree_cmp.cxx:23
RSpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
SpecSet(AP_tree *nodei, const SpecSetRegistry &ssr, const char *species_name)
GB_ERROR write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci)
unsigned char * allocate_bitstring() const
unsigned char * bitstring
long GBS_read_hash(const GB_HASH *hs, const char *key)
Definition: adhash.cxx:392