ARB
PT_io.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : PT_io.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 
12 #include "probe.h"
13 #include "pt_prototypes.h"
14 #include "PT_compress.h"
15 
16 #include <arbdbt.h>
17 #include <BI_basepos.hxx>
18 #include <arb_progress.h>
19 #include <arb_file.h>
20 
21 int compress_data(char *probestring) {
23  char c;
24  char *src,
25  *dest;
26  dest = src = probestring;
27 
28  while ((c=*(src++))) {
29  switch (c) {
30  case 'A':
31  case 'a': *(dest++) = PT_A; break;
32  case 'C':
33  case 'c': *(dest++) = PT_C; break;
34  case 'G':
35  case 'g': *(dest++) = PT_G; break;
36  case 'U':
37  case 'u':
38  case 'T':
39  case 't': *(dest++) = PT_T; break;
40  case 'N':
41  case 'n': *(dest++) = PT_N; break;
42  default: break;
43  }
44 
45  }
46  *dest = PT_QU;
47  return 0;
48 }
49 
50 ARB_ERROR probe_read_data_base(const char *name, bool readOnly) { // goes to header: __ATTR__USERESULT
53 
54  psg.gb_shell = new GB_shell;
55 
56  if (!readOnly && !GB_is_writeablefile(name)) {
57  error = GBS_global_string("Database '%s' is write-protected - aborting", name);
58  }
59  if (!error) {
60  GBDATA *gb_main = GB_open(name, readOnly ? "r" : "rw");
61  if (!gb_main) error = GB_await_error();
62  else {
63  error = GB_begin_transaction(gb_main);
64  if (!error) psg.gb_main = gb_main;
65  error = GB_end_transaction(gb_main, error);
66  }
67  }
68  return error;
69 }
70 
71 uchar PT_compressed::translate[256];
72 bool PT_compressed::translation_initialized = false;
73 
74 #if defined(COUNT_COMPRESSES_BASES)
75 BaseCounter PT_compressed::base_counter;
76 #endif
77 
78 size_t probe_compress_sequence(char *seq, size_t seqsize) {
79  // translates a readable sequence into PT_base
80  // (see also: probe_2_readable)
81 
82  PT_compressed compressed(seqsize);
83 
84  compressed.createFrom(reinterpret_cast<unsigned char*>(seq), seqsize);
85  pt_assert(compressed.get_size() <= (seqsize+1));
86 
87  memcpy(seq, compressed.get_seq(), compressed.get_size());
88  return compressed.get_size();
89 }
90 
91 char *readable_probe(const char *compressed_probe, size_t len, char T_or_U) {
92  static SmartMallocPtr(uchar) smart_tab;
93  uchar *tab = NULp;
94 
95  if (smart_tab.isNull()) {
96  ARB_alloc(tab, 256);
97  memset(tab, '?', 256);
98 
99  tab[PT_A] = 'A';
100  tab[PT_C] = 'C';
101  tab[PT_G] = 'G';
102  tab[PT_QU] = '.';
103  tab[PT_N] = 'N';
104 
105  tab[PT_B_UNDEF] = '!';
106 
107  smart_tab = tab;
108  }
109 
110  tab = &*smart_tab;
111  tab[PT_T] = T_or_U;
112 
113  char *result = ARB_alloc<char>(len+1);
114  for (size_t i = 0; i<len; ++i) {
115  result[i] = tab[safeCharIndex(compressed_probe[i])];
116  }
117  result[len] = 0;
118  return result;
119 }
120 
121 inline GBDATA *expect_entry(GBDATA *gb_species, const char *entry_name) {
122  GBDATA *gb_entry = GB_entry(gb_species, entry_name);
123  if (!gb_entry) {
124  GB_export_errorf("Expected entry '%s' is missing for species '%s'",
125  entry_name, GBT_get_name_or_description(gb_species));
126  }
127  return gb_entry;
128 }
129 
130 cache::Cache<SmartCharPtr> probe_input_data::seq_cache(1); // resized later
131 cache::Cache<probe_input_data::SmartIntPtr> probe_input_data::rel2abs_cache(1); // resized later
132 
134  GBDATA *gb_cs = expect_entry(gb_species_, "cs");
135  GBDATA *gb_compr = expect_entry(gb_species_, "compr");
136  GBDATA *gb_baseoff = expect_entry(gb_species_, "baseoff");
137 
138  GB_ERROR error = NULp;
139  if (!gb_cs || !gb_compr || !gb_baseoff) error = GB_await_error();
140  else {
141  gb_species = gb_species_;
142  size = GB_read_count(gb_compr);
143  }
144 
145  return error;
146 }
147 
148 inline GB_ERROR PT_prepare_species_sequence(GBDATA *gb_species, const char *alignment_name, bool& data_missing, PT_compressed& compressed) {
149  GB_ERROR error = NULp;
150  GBDATA *gb_ali = GB_entry(gb_species, alignment_name);
151  GBDATA *gb_data = gb_ali ? GB_entry(gb_ali, "data") : NULp;
152 
153  data_missing = false;
154 
155  if (!gb_data) {
156  data_missing = true;
157  }
158  else {
159  const char *seq = GB_read_char_pntr(gb_data);
160  if (!seq) {
161  error = GBS_global_string("Could not read data in '%s' for species '%s'\n(Reason: %s)",
162  alignment_name, GBT_get_name_or_description(gb_species), GB_await_error());
163  }
164  else {
165  size_t seqlen = GB_read_string_count(gb_data);
166  if (seqlen>compressed.get_allowed_size()) {
167  error = GBS_global_string("Sequence too long in '%s' of '%s'\n(Hint: format alignment to fix this problem)",
168  alignment_name, GBT_get_name_or_description(gb_species));
169  }
170 
171  if (!error) {
172  compressed.createFrom(seq, seqlen);
173  {
174  uint32_t checksum = GB_checksum(seq, seqlen, 1, ".-");
175  GBDATA *gb_cs = GB_create(gb_species, "cs", GB_INT);
176  error = gb_cs
177  ? GB_write_int(gb_cs, int32_t(checksum))
178  : GB_await_error();
179  }
180  }
181 
182  if (!error) {
183  GBDATA *gb_compr = GB_create(gb_species, "compr", GB_BYTES);
184  error = gb_compr
185  ? GB_write_bytes(gb_compr, compressed.get_seq(), compressed.get_size())
186  : GB_await_error();
187  }
188 
189  if (!error) {
190  GBDATA *gb_baseoff = GB_create(gb_species, "baseoff", GB_INTS);
191  error = gb_baseoff
192  ? GB_write_ints(gb_baseoff, compressed.get_offsets(), compressed.get_size())
193  : GB_await_error();
194  }
195 
196  if (!error) error = GB_delete(gb_ali); // delete original seq data
197  }
198  }
199 
200  return error;
201 }
202 
206 
207  if (!gb_species_data) {
208  error = GB_await_error();
209  }
210  else {
211  long icount = GB_number_of_subentries(gb_species_data);
212  int data_missing = 0;
213 
214  char *ali_name = GBT_get_default_alignment(gb_main);
215  long ali_len = GBT_get_alignment_len(gb_main, ali_name);
216 
217  PT_compressed compressBuffer(ali_len);
218 
219  printf("Database contains %li species\n", icount);
220  {
221  {
222  arb_progress progress("Preparing sequence data", icount);
223  for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data);
224  gb_species && !error;
225  )
226  {
227  GBDATA *gb_next = GBT_next_species(gb_species);
228  bool no_data;
229 
230  error = PT_prepare_species_sequence(gb_species, ali_name, no_data, compressBuffer);
231  if (no_data) {
232  pt_assert(!error);
233  data_missing++;
234  error = GB_delete(gb_species);
235  }
236  progress.inc();
237  gb_species = gb_next;
238  }
239  if (error) progress.done();
240  }
241 
242  if (!error) {
243  char *master_data_name = GBS_global_string_copy("%s/@master_data", GB_SYSTEM_FOLDER);
244  GBDATA *gb_master_data = GB_search(gb_main, master_data_name, GB_FIND);
245  if (gb_master_data) error = GB_delete(gb_master_data);
246  free(master_data_name);
247  }
248  }
249  if (data_missing) {
250  printf("\n%i species were ignored because of missing data.\n", data_missing);
251  }
252  else {
253  printf("\nAll species contain data in alignment '%s'.\n", ali_name);
254  }
255  fflush_all();
256  free(ali_name);
257  }
258 
259  error = GB_end_transaction(gb_main, error);
260  return error;
261 }
262 
264  // reads sequence data into psg.data
265 
267 
268  // read ref SAI (e.g. ecoli)
269  {
270  char *def_ref = GBT_get_default_ref(psg.gb_main);
271  GBDATA *gb_sai_data = GBT_get_SAI_data(psg.gb_main);
272  GBDATA *gb_ref = GBT_find_SAI_rel_SAI_data(gb_sai_data, def_ref);
273 
274  psg.ecoli = NULp;
275  if (gb_ref) {
276  GBDATA *gb_data = GBT_find_sequence(gb_ref, psg.alignment_name);
277  if (gb_data) {
278  psg.ecoli = GB_read_string(gb_data);
279  }
280  }
281  free(def_ref);
282  }
283 
285  long icount = GB_number_of_subentries(gb_species_data);
286 
287  psg.data = new probe_input_data[icount];
288  psg.data_count = 0;
289 
290  printf("Database contains %li species\n", icount);
291 
292  GB_ERROR error = NULp;
293  {
294  arb_progress progress("Checking data", icount);
295  int count = 0;
296 
297  for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data);
298  gb_species;
299  gb_species = GBT_next_species(gb_species))
300  {
301  probe_input_data& pid = psg.data[count];
302 
303  error = pid.init(gb_species);
304  if (error) break;
305  count++;
306  progress.inc();
307  }
308 
309  psg.data_count = count;
311 
312  if (error) progress.done();
313  }
314 
315  fflush_all();
316  return error;
317 }
318 
320  long i;
322  for (i=0; i<psg.data_count; i++) {
324  }
325  unsigned int max_size;
326  max_size = 0;
327  for (i = 0; i < psg.data_count; i++) { // get max sequence len
328  max_size = std::max(max_size, (unsigned)(psg.data[i].get_size()));
329  psg.char_count += psg.data[i].get_size();
330  }
331  psg.max_size = max_size;
332 
333  if (psg.ecoli) {
334  BI_ecoli_ref *ref = new BI_ecoli_ref;
335  ref->init(psg.ecoli, strlen(psg.ecoli));
336  psg.bi_ecoli = ref;
337  }
338 }
339 
340 
341 long PT_abs_2_ecoli_rel(long pos) {
342  if (!psg.ecoli) return pos;
343  return psg.bi_ecoli->abs_2_rel(pos);
344 }
345 
346 // --------------------------------------------------------------------------------
347 
348 #ifdef UNIT_TESTS
349 #ifndef TEST_UNIT_H
350 #include <test_unit.h>
351 #endif
352 
353 inline int *intcopy(int i) { int *ip = new int; *ip = i; return ip; }
354 
355 #define CACHED(p,t) that(p.is_cached()).is_equal_to(t)
356 #define CACHED_p123(t1,t2,t3) all().of(CACHED(p1, t1), CACHED(p2, t2), CACHED(p3, t3))
357 
358 void TEST_CachedPtr() {
359  using namespace cache;
360  {
361  typedef SmartPtr<int> IntPtr;
362 
366 
367  {
368  Cache<IntPtr> cache(2);
369  TEST_EXPECT_ZERO(cache.entries()); // nothing cached yet
370 
371  p1.assign(intcopy(1), cache);
372  TEST_EXPECT_EQUAL(*p1.access(cache), 1);
373  TEST_EXPECT_EQUAL(cache.entries(), 1);
374  TEST_EXPECTATION(CACHED_p123(true, false, false));
375 
376  p2.assign(intcopy(2), cache);
377  TEST_EXPECT_EQUAL(*p2.access(cache), 2);
378  TEST_EXPECT_EQUAL(cache.entries(), 2);
379  TEST_EXPECTATION(CACHED_p123(true, true, false));
380 
381  p3.assign(intcopy(3), cache);
382  TEST_EXPECT_EQUAL(*p3.access(cache), 3);
383  TEST_EXPECT_EQUAL(cache.entries(), 2);
384  TEST_EXPECTATION(CACHED_p123(false, true, true)); // p1 has been invalidated by caching p3
385 
386  p3.assign(intcopy(33), cache); // test re-assignment
387  TEST_EXPECT_EQUAL(*p3.access(cache), 33);
388  TEST_EXPECT_EQUAL(cache.entries(), 2);
389  TEST_EXPECTATION(CACHED_p123(false, true, true)); // p2 still cached
390 
391  TEST_EXPECT_EQUAL(*p2.access(cache), 2); // should make p2 the LRU cache entry
392  TEST_EXPECT_EQUAL(cache.entries(), 2);
393  TEST_EXPECTATION(CACHED_p123(false, true, true)); // p2 still cached
394 
395  IntPtr s4;
396  {
398  p4.assign(intcopy(4), cache);
399  TEST_EXPECT_EQUAL(*p4.access(cache), 4);
400  TEST_EXPECT_EQUAL(cache.entries(), 2);
401 
402  s4 = p4.access(cache); // keep data of p4 in s4
403  TEST_EXPECT_EQUAL(s4.references(), 2); // ref'd by s4 and p4
404 
405  p4.release(cache); // need to release p4 before destruction (otherwise assertion fails)
406  }
407 
408  TEST_EXPECT_EQUAL(*s4, 4); // check kept value of deleted CacheHandle
409  TEST_EXPECT_EQUAL(s4.references(), 1); // only ref'd by s4
410 
411  TEST_EXPECT_EQUAL(cache.entries(), 1); // contains only p2 (p4 has been released)
412  TEST_EXPECTATION(CACHED_p123(false, true, false)); // p3 has been invalidated by caching p4
413  }
414 
415  TEST_EXPECTATION(CACHED_p123(false, false, false)); // Cache was destroyed = > all CacheHandle will be invalid
416  // no need to release p1..p3 (due cache was destroyed)
417  }
418 
419  // test cache of SmartCharPtr
420  {
422 
423  {
424  const int P = 4;
426  const char *word[] = { "apple", "orange", "pie", "juice" };
427 
428  for (int i = 0; i<P; ++i) p[i].assign(ARB_strdup(word[i]), cache);
429  TEST_REJECT(p[0].is_cached());
430  for (int i = 1; i<P; ++i) TEST_EXPECT_EQUAL(&*p[i].access(cache), word[i]);
431 
432  TEST_REJECT(p[0].is_cached());
433  TEST_EXPECT(p[1].is_cached()); // oldest entry
434 
435  cache.resize(cache.size()-1);
436  TEST_REJECT(p[1].is_cached()); // invalidated by resize
437 
438  for (int i = P-1; i >= 0; --i) p[i].assign(ARB_strdup(word[P-1-i]), cache);
439 
440  for (int i = 0; i<2; ++i) TEST_EXPECT_EQUAL(&*p[i].access(cache), word[P-1-i]);
441  for (int i = 2; i<P; ++i) TEST_REJECT(p[i].is_cached());
442 
443  cache.flush();
444  }
445  }
446 }
447 
448 #endif // UNIT_TESTS
449 
450 // --------------------------------------------------------------------------------
struct probe_input_data * data
Definition: probe.h:356
GB_ERROR GB_begin_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2492
char * alignment_name
Definition: probe.h:352
const char * GB_ERROR
Definition: arb_core.h:25
#define GB_SYSTEM_FOLDER
Definition: arbdb.h:27
string result
GB_ERROR PT_prepare_species_sequence(GBDATA *gb_species, const char *alignment_name, bool &data_missing, PT_compressed &compressed)
Definition: PT_io.cxx:148
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GB_ERROR GB_commit_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2515
static GB_ERROR tab(GBL_command_arguments *args, bool pretab)
Definition: adlang1.cxx:913
Definition: probe.h:83
void fflush_all()
Definition: PT_tools.h:211
GB_ERROR GB_write_bytes(GBDATA *gbd, const char *s, long size)
Definition: arbdb.cxx:1408
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:457
GB_ERROR init(GBDATA *gb_species_)
Definition: PT_io.cxx:133
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
Definition: probe.h:84
SMARTPTR access(Cache< SMARTPTR > &cache)
Definition: cache.h:275
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2525
probe_struct_global psg
Definition: PT_main.cxx:36
const unsigned * get_offsets() const
Definition: PT_compress.h:184
GB_ERROR PT_prepare_data(GBDATA *gb_main)
Definition: PT_io.cxx:203
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
Definition: probe.h:85
GBDATA * gb_main
Definition: probe.h:351
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
int get_size() const
Definition: probe.h:211
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:706
static char * alignment_name
GB_HASH * namehash
Definition: probe.h:353
uint32_t GB_checksum(const char *seq, long length, int ignore_case, const char *exclude)
Definition: adstring.cxx:320
GB_shell * gb_shell
Definition: probe.h:350
FILE * seq
Definition: rns.c:46
size_t get_allowed_size() const
Definition: PT_compress.h:182
const char * get_seq() const
Definition: PT_compress.h:183
GBDATA * expect_entry(GBDATA *gb_species, const char *entry_name)
Definition: PT_io.cxx:121
void createFrom(const unsigned char *const seq, const size_t length)
Definition: PT_compress.h:120
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1880
GBDATA * GBT_first_species_rel_species_data(GBDATA *gb_species_data)
Definition: aditem.cxx:121
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:886
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:353
#define TEST_EXPECT(cond)
Definition: test_unit.h:1312
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:728
int abs_2_rel(int abs) const
Definition: BI_basepos.hxx:76
GB_ERROR PT_init_input_data()
Definition: PT_io.cxx:263
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
size_t probe_compress_sequence(char *seq, size_t seqsize)
Definition: PT_io.cxx:78
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1755
GBDATA * gb_species_data
Definition: adname.cxx:34
long GB_number_of_subentries(GBDATA *gbd)
Definition: arbdb.cxx:2856
Generic smart pointer.
Definition: smartptr.h:149
char * readable_probe(const char *compressed_probe, size_t len, char T_or_U)
Definition: PT_io.cxx:91
int compress_data(char *probestring)
Definition: PT_io.cxx:21
#define TEST_REJECT(cond)
Definition: test_unit.h:1314
#define SmartMallocPtr(type)
Definition: smartptr.h:45
static void error(const char *msg)
Definition: mkptypes.cxx:96
const char * get_shortname() const
Definition: probe.h:170
long PT_abs_2_ecoli_rel(long pos)
Definition: PT_io.cxx:341
ARB_ERROR probe_read_data_base(const char *name, bool readOnly)
Definition: PT_io.cxx:50
void assign(SMARTPTR data, Cache< SMARTPTR > &cache)
Definition: cache.h:251
void PT_build_species_hash()
Definition: PT_io.cxx:319
void init(const char *seq, int size)
Definition: BI_basepos.hxx:109
Definition: probe.h:86
Definition: probe.h:88
#define pt_assert(bed)
Definition: PT_tools.h:22
char * GBT_get_default_ref(GBDATA *)
Definition: adtools.cxx:66
Definition: arbdb.h:86
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:670
Handle for Cache entries.
Definition: cache.h:33
GB_ERROR GB_write_int(GBDATA *gbd, long i)
Definition: arbdb.cxx:1220
#define TEST_EXPECT_ZERO(cond)
Definition: test_unit.h:1074
bool GB_is_writeablefile(const char *filename)
Definition: arb_file.cxx:148
#define TEST_EXPECTATION(EXPCTN)
Definition: test_unit.h:1037
Definition: arbdb.h:72
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:264
BI_ecoli_ref * bi_ecoli
Definition: probe.h:359
GB_ERROR GB_write_ints(GBDATA *gbd, const GB_UINT4 *i, long size)
Definition: arbdb.cxx:1413
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:879
void release(Cache< SMARTPTR > &cache)
Definition: cache.h:288
size_t get_size() const
Definition: PT_compress.h:181
void GB_set_verbose()
Definition: ad_load.cxx:31
unsigned char uchar
Definition: gde.hxx:21
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:97
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:675
GBDATA * GBT_find_SAI_rel_SAI_data(GBDATA *gb_sai_data, const char *name)
Definition: aditem.cxx:169
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:874
GBDATA * gb_main
Definition: adname.cxx:33
Definition: arbdb.h:71
GBDATA * GBT_get_SAI_data(GBDATA *gb_main)
Definition: aditem.cxx:152
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:437
static Params P
Definition: arb_probe.cxx:81
#define TEST_EXPECT_EQUAL(expr, want)
Definition: test_unit.h:1283
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
Definition: probe.h:87
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:195
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
Definition: cache.h:31
Definition: arbdb.h:66
GBDATA * GBT_get_species_data(GBDATA *gb_main)
Definition: aditem.cxx:105
#define max(a, b)
Definition: f2c.h:154