ARB
probe.h
Go to the documentation of this file.
1 #ifndef PROBE_H
2 #define PROBE_H
3 
4 #ifndef __LIST__
5 #include <list>
6 #endif
7 #ifndef __SET__
8 #include <set>
9 #endif
10 
11 #ifndef ARBDB_H
12 #include <arbdb.h>
13 #endif
14 #ifndef PT_COM_H
15 #include <PT_com.h>
16 #endif
17 #ifndef AISC_GEN_SERVER_INCLUDED
18 #include <PT_server.h>
19 #endif
20 #ifndef PT_TOOLS_H
21 #include "PT_tools.h"
22 #endif
23 #ifndef CACHE_H
24 #include <cache.h>
25 #endif
26 
27 #define PT_SERVER_MAGIC 0x32108765 // pt server identifier
28 #define PT_SERVER_VERSION 3 // version of pt server database (no versioning till 2009/05/13)
29 
30 #if defined(DEBUG)
31 // # define PTM_DEBUG_NODES
32 # define PTM_DEBUG_STAGE_ASSERTIONS
33 // # define PTM_DEBUG_VALIDATE_CHAINS
34 #endif // DEBUG
35 
36 #define CALCULATE_STATS_ON_QUERY
37 
38 #if defined(PTM_DEBUG_STAGE_ASSERTIONS)
39 #define pt_assert_stage(s) pt_assert(psg.get_stage() == (s))
40 #else // !defined(PTM_DEBUG_STAGE_ASSERTIONS)
41 #define pt_assert_stage(s)
42 #endif
43 
44 #if defined(PTM_DEBUG_VALIDATE_CHAINS)
45 #define pt_assert_valid_chain_stage1(node) pt_assert(PT_chain_has_valid_entries<ChainIteratorStage1>(node))
46 #else // !defined(PTM_DEBUG_VALIDATE_CHAINS)
47 #define pt_assert_valid_chain_stage1(node)
48 #endif
49 
50 typedef unsigned long ULONG;
51 typedef unsigned int UINT;
52 typedef unsigned char uchar;
53 
54 #define PT_MAX_PARTITION_DEPTH 4
55 
56 #define PT_POS_TREE_HEIGHT 20
57 #define PT_MIN_TREE_HEIGHT PT_MAX_PARTITION_DEPTH
58 
59 #define MIN_PROBE_LENGTH 2
60 
65 };
66 
67 
68 
69 #define MATCHANSWER 50 // private msg type: match result answer
70 #define CREATEANSWER 51 // private msg type: create result answer
71 #define FINDANSWER 52 // private msg type: find result answer
72 
73 extern int gene_flag; // if 'gene_flag' == 1 -> we are a gene pt server
74 struct Hs_struct;
75 
76 enum type_types {
77  t_int = 1,
78  t_string = 0,
79  t_float = 2
80 };
81 
82 enum PT_base {
83  PT_QU = 0,
84  PT_N = 1,
91 };
92 
93 CONSTEXPR_INLINE bool is_std_base (char b) { return b >= PT_A && b <= PT_T; }
94 CONSTEXPR_INLINE bool is_std_base_or_N(char b) { return b >= PT_N && b <= PT_T; }
95 CONSTEXPR_INLINE bool is_ambig_base (char b) { return b == PT_QU || b == PT_N; }
96 CONSTEXPR_INLINE bool is_valid_base (char b) { return b >= PT_QU && b < PT_BASES; }
97 
99  return base<PT_BASES ? ".NACGU"[safeCharIndex(base)] : base;
100 }
101 
102 inline char *probe_2_readable(char *id_string, int len) {
104  // caution if 'id_string' contains PT_QU ( == zero == EOS).
105  // (see also: probe_compress_sequence)
106  for (int i = 0; i<len; ++i) {
107  id_string[i] = base_2_readable(id_string[i]);
108  }
109  return id_string;
110 }
111 
112 inline void reverse_probe(char *seq, int len) {
113  int i = 0;
114  int j = len-1;
115 
116  while (i<j) std::swap(seq[i++], seq[j--]);
117 }
118 
119 struct POS_TREE1;
120 struct POS_TREE2;
121 
122 enum Stage { STAGE1, STAGE2 };
123 
124 // ---------------------
125 // Probe search
126 
127 class probe_input_data : virtual Noncopyable { // every taxa's own data
128  int size; // @@@ misleading (contains 1 if no bases in sequence)
129  GBDATA *gb_species;
130  bool group; // probe_design: whether species is in group
131 
132  typedef SmartArrayPtr(int) SmartIntPtr;
133 
135  mutable cache::CacheHandle<SmartIntPtr> rel2abs;
136 
137  static cache::Cache<SmartCharPtr> seq_cache;
138  static cache::Cache<SmartIntPtr> rel2abs_cache;
139 
140  SmartCharPtr loadSeq() const {
141  GB_transaction ta(gb_species);
142  GBDATA *gb_compr = GB_entry(gb_species, "compr");
143  return SmartCharPtr(GB_read_bytes(gb_compr));
144  }
145 
146 public:
147 
149  : size(0),
150  gb_species(NULp),
151  group(false)
152  {}
154  seq.release(seq_cache);
155  rel2abs.release(rel2abs_cache);
156  }
157 
158  static void set_cache_sizes(size_t seq_cache_size, size_t abs_cache_size) {
159  seq_cache.resize(seq_cache_size);
160  rel2abs_cache.resize(abs_cache_size);
161  }
162 
163  GB_ERROR init(GBDATA *gb_species_);
164 
165  SmartCharPtr get_dataPtr() const {
166  if (!seq.is_cached()) seq.assign(loadSeq(), seq_cache);
167  return seq.access(seq_cache);
168  }
169 
170  const char *get_shortname() const {
171  GB_transaction ta(gb_species);
172 
173  GBDATA *gb_name = GB_entry(gb_species, "name");
174  pt_assert(gb_name);
175  return GB_read_char_pntr(gb_name);
176  }
177  const char *get_fullname() const {
178  GB_transaction ta(gb_species);
179 
180  GBDATA *gb_full = GB_entry(gb_species, "full_name");
181  return gb_full ? GB_read_char_pntr(gb_full) : "";
182  }
183 
184  const char *get_acc() const {
185  GB_transaction ta(gb_species);
186 
187  GBDATA *gb_acc = GB_entry(gb_species, "acc");
188  pt_assert(gb_acc);
189  return gb_acc ? GB_read_char_pntr(gb_acc) : NULp;
190  }
191  int get_start() const {
192  GB_transaction ta(gb_species);
193 
194  GBDATA *gb_start = GB_entry(gb_species, "start");
195  return gb_start ? GB_read_int(gb_start) : 0;
196  }
197  int get_stop() const {
198  GB_transaction ta(gb_species);
199 
200  GBDATA *gb_stop = GB_entry(gb_species, "stop");
201  return gb_stop ? GB_read_int(gb_stop) : 0;
202  }
203 
204  long get_checksum() const { // @@@ change return-type -> uint32_t
205  GB_transaction ta(gb_species);
206  GBDATA *gb_cs = GB_entry(gb_species, "cs");
207  pt_assert(gb_cs);
208  uint32_t csum = uint32_t(GB_read_int(gb_cs));
209  return csum;
210  }
211  int get_size() const { return size; }
212 
213  bool valid_rel_pos(int rel_pos) const { // returns true if rel_pos is inside sequence
214  return rel_pos >= 0 && rel_pos<get_size();
215  }
216 
217  bool inside_group() const { return group; }
218  bool outside_group() const { return !group; }
219 
220  void set_group_state(bool isGroupMember) { group = isGroupMember; }
221 
222  long get_geneabspos() const {
223  pt_assert(gene_flag); // only legal in gene-ptserver
224  GBDATA *gb_pos = GB_entry(gb_species, "abspos");
225  if (gb_pos) return GB_read_int(gb_pos);
226  return -1;
227  }
228 
229  void preload_rel2abs() const {
230  if (!rel2abs.is_cached()) {
231  GB_transaction ta(gb_species);
232 
233  GBDATA *gb_baseoff = GB_entry(gb_species, "baseoff");
234  pt_assert(gb_baseoff);
235 
236  const GB_CUINT4 *baseoff = GB_read_ints_pntr(gb_baseoff);
237 
238  int *r2a = new int[size];
239 
240  int abs = 0;
241  for (int rel = 0; rel<size; ++rel) {
242  abs += baseoff[rel];
243  r2a[rel] = abs;
244  }
245 
246  rel2abs.assign(r2a, rel2abs_cache);
247  }
248  }
249  size_t get_abspos(size_t rel_pos) const {
250  pt_assert(rel2abs.is_cached()); // you need to call preload_rel2abs
251  return (&*rel2abs.access(rel2abs_cache))[rel_pos]; // @@@ brute-forced
252  }
253 
254  size_t calc_relpos(int abs_pos) const { // expensive
255  preload_rel2abs();
256  SmartIntPtr rel2abs_ptr = rel2abs.access(rel2abs_cache);
257  const int *r2a = &*rel2abs_ptr;
258 
259  int l = 0;
260  int h = get_size()-1;
261 
262  if (r2a[l] == abs_pos) return l;
263  if (r2a[h] == abs_pos) return h;
264 
265  while (l<h) {
266  int m = (l+h)/2;
267  if (r2a[m]<abs_pos) {
268  l = m;
269  }
270  else if (r2a[m]>abs_pos) {
271  h = m;
272  }
273  else {
274  return m;
275  }
276  }
277  return l;
278  }
279 };
280 
282  // common
283  int cut_offs; // statistic of chains
287  int longs;
288  int shorts;
289  int shorts2;
290  int chars;
291 
292 #ifdef ARB_64
293  // 64bit specials
294  int int_node;
295  int ints;
296  int ints2;
297  long maxdiff;
298 #endif
299 
300  void setup();
301 };
302 
303 class BI_ecoli_ref;
304 
305 class MostUsedPos : virtual Noncopyable {
306  int pos;
307  int used;
308 
309  MostUsedPos *next;
310 
311  void swapWith(MostUsedPos *other) {
312  std::swap(pos, other->pos);
313  std::swap(used, other->used);
314  }
315 
316 public:
317  MostUsedPos() : pos(0), used(0), next(NULp) { }
318  MostUsedPos(int p) : pos(p), used(1), next(NULp) { }
319  ~MostUsedPos() { delete next; }
320 
321  void clear() {
322  pos = 0;
323  used = 0;
324  delete next;
325  next = NULp;
326  }
327 
328  void announce(int p) {
329  if (p == pos) used++;
330  else {
331  if (next) next->announce(p);
332  else next = new MostUsedPos(p);
333  if (next->used>used) swapWith(next);
334  }
335  }
336 
337 
338  int get_most_used() const { return pos; }
339 };
340 
342  Stage stage;
343 
344  union {
347  } pt;
348 
349 public:
351  GBDATA *gb_main; // ARBDB interface
353  GB_HASH *namehash; // name to int
354 
356  struct probe_input_data *data; // the internal database
357 
358  char *ecoli; // the ecoli sequenz
360 
361  int max_size; // maximum sequence len
362  long char_count; // number of all 'acgtuACGTU'
363 
364  int reversed; // tell the matcher whether probe is reversed
365 
366  double *pos_to_weight; // position to weight
367 
369 
370  int sort_by;
371 
372  char *main_probe;
373 
374  char *server_name; // name of this server
376  T_PT_MAIN main;
377  Hs_struct *com_so; // the communication socket
378 
380 
381  bool big_db; // STAGE2 only (true -> uses 8 bit pointers)
382 
383  void setup();
384  void cleanup();
385 
386  void enter_stage(Stage stage_);
387  Stage get_stage() const { return stage; }
388 
389  POS_TREE1*& TREE_ROOT1() { pt_assert(stage == STAGE1); return pt.p1; }
390  POS_TREE2*& TREE_ROOT2() { pt_assert(stage == STAGE2); return pt.p2; }
391 };
392 
393 extern probe_struct_global psg;
394 
395 class gene_struct {
396  char *gene_name;
397  const char *arb_species_name; // pointers into 'gene_name'
398  const char *arb_gene_name;
399 
400  void init(const char *gene_name_, const char *arb_species_name_, const char *arb_gene_name_) {
401  int gene_name_len = strlen(gene_name_);
402  int arb_species_name_len = strlen(arb_species_name_);
403  int arb_gene_name_len = strlen(arb_gene_name_);
404 
405  int fulllen = gene_name_len+1+arb_species_name_len+1+arb_gene_name_len+1;
406  gene_name = new char[fulllen];
407  strcpy(gene_name, gene_name_);
408  arb_species_name = gene_name+(gene_name_len+1);
409  strcpy((char*)arb_species_name, arb_species_name_);
410  arb_gene_name = arb_species_name+(arb_species_name_len+1);
411  strcpy((char*)arb_gene_name, arb_gene_name_);
412  }
413 
414 public:
415  gene_struct(const char *gene_name_, const char *arb_species_name_, const char *arb_gene_name_) {
416  init(gene_name_, arb_species_name_, arb_gene_name_);
417  }
418  gene_struct(const gene_struct& other) {
419  if (&other != this) {
420  init(other.get_internal_gene_name(), other.get_arb_species_name(), other.get_arb_gene_name());
421  }
422  }
424  if (&other != this) {
425  delete [] gene_name;
426  init(other.get_internal_gene_name(), other.get_arb_species_name(), other.get_arb_gene_name());
427  }
428  return *this;
429  }
430 
432  delete [] gene_name;
433  }
434 
435  const char *get_internal_gene_name() const { return gene_name; }
436  const char *get_arb_species_name() const { return arb_species_name; }
437  const char *get_arb_gene_name() const { return arb_gene_name; }
438 };
439 
440 struct ltByArbName {
441  bool operator()(const gene_struct *gs1, const gene_struct *gs2) const {
442  int cmp = strcmp(gs1->get_arb_species_name(), gs2->get_arb_species_name());
443  if (cmp == 0) { cmp = strcmp(gs1->get_arb_gene_name(), gs2->get_arb_gene_name()); }
444  return cmp<0;
445  }
446 };
448  bool operator()(const gene_struct *gs1, const gene_struct *gs2) const {
449  int cmp = strcmp(gs1->get_internal_gene_name(), gs2->get_internal_gene_name());
450  return cmp<0;
451  }
452 };
453 
454 typedef std::list<gene_struct> gene_struct_list;
455 typedef std::set<const gene_struct *, ltByInternalName> gene_struct_index_internal;
456 typedef std::set<const gene_struct *, ltByArbName> gene_struct_index_arb;
457 
458 extern gene_struct_index_arb gene_struct_arb2internal; // sorted by arb species+gene name
459 extern gene_struct_index_internal gene_struct_internal2arb; // sorted by internal name
460 
461 #else
462 #error probe.h included twice
463 #endif
464 
struct probe_input_data * data
Definition: probe.h:356
char * alignment_name
Definition: probe.h:352
const char * GB_ERROR
Definition: arb_core.h:25
Definition: probe.h:122
int get_stop() const
Definition: probe.h:197
bool operator()(const gene_struct *gs1, const gene_struct *gs2) const
Definition: probe.h:448
gene_struct(const gene_struct &other)
Definition: probe.h:418
Stage get_stage() const
Definition: probe.h:387
Definition: probe.h:83
SmartCharPtr get_dataPtr() const
Definition: probe.h:165
long GB_read_int(GBDATA *gbd)
Definition: arbdb.cxx:729
POS_TREE1 *& TREE_ROOT1()
Definition: probe.h:389
GB_ERROR init(GBDATA *gb_species_)
Definition: PT_io.cxx:133
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
Definition: probe.h:84
SMARTPTR access(Cache< SMARTPTR > &cache)
Definition: cache.h:275
const char * get_fullname() const
Definition: probe.h:177
std::set< const gene_struct *, ltByInternalName > gene_struct_index_internal
Definition: probe.h:455
size_t get_abspos(size_t rel_pos) const
Definition: probe.h:249
probe_struct_global psg
Definition: PT_main.cxx:36
~MostUsedPos()
Definition: probe.h:319
unsigned long ULONG
Definition: probe.h:50
Definition: probe.h:85
GBDATA * gb_main
Definition: probe.h:351
std::list< gene_struct > gene_struct_list
Definition: probe.h:454
int get_size() const
Definition: probe.h:211
~probe_input_data()
Definition: probe.h:153
void resize(size_t new_size)
Definition: cache.h:214
PT_MATCH_TYPE
Definition: probe.h:61
Stage
Definition: probe.h:122
GB_HASH * namehash
Definition: probe.h:353
probe_statistic_struct stat
Definition: probe.h:379
GB_shell * gb_shell
Definition: probe.h:350
FILE * seq
Definition: rns.c:46
gene_struct_index_internal gene_struct_internal2arb
Definition: PT_main.cxx:47
char * GB_read_bytes(GBDATA *gbd)
Definition: arbdb.cxx:974
CONSTEXPR_INLINE bool is_valid_base(char b)
Definition: probe.h:96
void reverse_probe(char *seq, int len)
Definition: probe.h:112
void enter_stage(Stage stage_)
bool outside_group() const
Definition: probe.h:218
gene_struct_index_arb gene_struct_arb2internal
Definition: PT_main.cxx:46
bool is_cached() const
Definition: cache.h:298
std::set< const gene_struct *, ltByArbName > gene_struct_index_arb
Definition: probe.h:456
Definition: probe.h:77
Definition: probe.h:122
double * pos_to_weight
Definition: probe.h:366
void announce(int p)
Definition: probe.h:328
int get_start() const
Definition: probe.h:191
#define false
Definition: ureadseq.h:13
char * main_probe
Definition: probe.h:372
const char * get_arb_gene_name() const
Definition: probe.h:437
const char * get_shortname() const
Definition: probe.h:170
Definition: probe.h:78
int get_most_used() const
Definition: probe.h:338
CONSTEXPR_INLINE bool is_std_base(char b)
Definition: probe.h:93
void assign(SMARTPTR data, Cache< SMARTPTR > &cache)
Definition: cache.h:251
#define CONSTEXPR_INLINE
Definition: cxxforward.h:111
CONSTEXPR_INLINE_Cxx14 void swap(unsigned char &c1, unsigned char &c2)
Definition: ad_io_inline.h:19
#define cmp(h1, h2)
Definition: admap.cxx:50
Definition: probe.h:86
CONSTEXPR_INLINE bool is_std_base_or_N(char b)
Definition: probe.h:94
Definition: probe.h:88
#define pt_assert(bed)
Definition: PT_tools.h:22
bool inside_group() const
Definition: probe.h:217
Definition: probe.h:79
const char * get_arb_species_name() const
Definition: probe.h:436
GB_CUINT4 * GB_read_ints_pntr(GBDATA *gbd)
Definition: arbdb.cxx:979
long get_checksum() const
Definition: probe.h:204
void set_group_state(bool isGroupMember)
Definition: probe.h:220
aisc_com * link
Definition: probe.h:375
probe_input_data()
Definition: probe.h:148
gene_struct & operator=(const gene_struct &other)
Definition: probe.h:423
CONSTEXPR_INLINE char base_2_readable(char base)
Definition: probe.h:98
PT_base
Definition: probe.h:82
POS_TREE1 * p1
Definition: probe.h:345
void preload_rel2abs() const
Definition: probe.h:229
BI_ecoli_ref * bi_ecoli
Definition: probe.h:359
size_t calc_relpos(int abs_pos) const
Definition: probe.h:254
int gene_flag
Definition: PT_main.cxx:39
Hs_struct * com_so
Definition: probe.h:377
gene_struct(const char *gene_name_, const char *arb_species_name_, const char *arb_gene_name_)
Definition: probe.h:415
const char * get_internal_gene_name() const
Definition: probe.h:435
type_types
Definition: probe.h:76
static void set_cache_sizes(size_t seq_cache_size, size_t abs_cache_size)
Definition: probe.h:158
POS_TREE2 *& TREE_ROOT2()
Definition: probe.h:390
bool valid_rel_pos(int rel_pos) const
Definition: probe.h:213
char * probe_2_readable(char *id_string, int len)
Definition: probe.h:102
#define abs(x)
Definition: f2c.h:151
Definition: probe.h:89
long get_geneabspos() const
Definition: probe.h:222
void release(Cache< SMARTPTR > &cache)
Definition: cache.h:288
MostUsedPos(int p)
Definition: probe.h:318
POS_TREE2 * p2
Definition: probe.h:346
#define NULp
Definition: cxxforward.h:116
const char * get_acc() const
Definition: probe.h:184
~gene_struct()
Definition: probe.h:431
unsigned char uchar
Definition: probe.h:52
unsigned int UINT
Definition: probe.h:51
GB_transaction ta(gb_var)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
void clear()
Definition: probe.h:321
char * server_name
Definition: probe.h:374
MostUsedPos abs_pos
Definition: probe.h:368
T_PT_MAIN main
Definition: probe.h:376
bool operator()(const gene_struct *gs1, const gene_struct *gs2) const
Definition: probe.h:441
const unsigned int GB_CUINT4
Definition: arbdb_base.h:40
CONSTEXPR_INLINE bool is_ambig_base(char b)
Definition: probe.h:95
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
Definition: probe.h:87
MostUsedPos()
Definition: probe.h:317