ARB
PT_io.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : PT_io.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 
12 #include "probe.h"
13 #include "pt_prototypes.h"
14 #include "PT_compress.h"
15 
16 #include <arbdbt.h>
17 #include <BI_basepos.hxx>
18 #include <arb_progress.h>
19 #include <arb_file.h>
20 
21 int compress_data(char *probestring) {
23  char c;
24  char *src,
25  *dest;
26  dest = src = probestring;
27 
28  while ((c=*(src++))) {
29  switch (c) {
30  case 'A':
31  case 'a': *(dest++) = PT_A; break;
32  case 'C':
33  case 'c': *(dest++) = PT_C; break;
34  case 'G':
35  case 'g': *(dest++) = PT_G; break;
36  case 'U':
37  case 'u':
38  case 'T':
39  case 't': *(dest++) = PT_T; break;
40  case 'N':
41  case 'n': *(dest++) = PT_N; break;
42  default: break;
43  }
44 
45  }
46  *dest = PT_QU;
47  return 0;
48 }
49 
50 ARB_ERROR probe_read_data_base(const char *name, bool readOnly) { // goes to header: __ATTR__USERESULT
53 
54  psg.gb_shell = new GB_shell;
55 
56  if (!readOnly && !GB_is_writeablefile(name)) {
57  error = GBS_global_string("Database '%s' is write-protected - aborting", name);
58  }
59  if (!error) {
60  GBDATA *gb_main = GB_open(name, readOnly ? "r" : "rw");
61  if (!gb_main) error = GB_await_error();
62  else {
63  error = GB_begin_transaction(gb_main);
64  if (!error) psg.gb_main = gb_main;
65  error = GB_end_transaction(gb_main, error);
66  }
67  }
68  return error;
69 }
70 
71 uchar PT_compressed::translate[256];
72 bool PT_compressed::translation_initialized = false;
73 
74 #if defined(COUNT_COMPRESSES_BASES)
75 BaseCounter PT_compressed::base_counter;
76 #endif
77 
78 size_t probe_compress_sequence(char *seq, size_t seqsize) {
79  // translates a readable sequence into PT_base
80  // (see also: probe_2_readable)
81 
82  PT_compressed compressed(seqsize);
83 
84  compressed.createFrom(reinterpret_cast<unsigned char*>(seq), seqsize);
85  pt_assert(compressed.get_size() <= (seqsize+1));
86 
87  memcpy(seq, compressed.get_seq(), compressed.get_size());
88  return compressed.get_size();
89 }
90 
91 char *readable_probe(const char *compressed_probe, size_t len, char T_or_U) {
92  static SmartMallocPtr(uchar) smart_tab;
93  uchar *tab = NULp;
94 
95  if (smart_tab.isNull()) {
96  ARB_alloc(tab, 256);
97  memset(tab, '?', 256);
98 
99  tab[PT_A] = 'A';
100  tab[PT_C] = 'C';
101  tab[PT_G] = 'G';
102  tab[PT_QU] = '.';
103  tab[PT_N] = 'N';
104 
105  tab[PT_B_UNDEF] = '!';
106 
107  smart_tab = tab;
108  }
109 
110  tab = &*smart_tab;
111  tab[PT_T] = T_or_U;
112 
113  char *result = ARB_alloc<char>(len+1);
114  for (size_t i = 0; i<len; ++i) {
115  result[i] = tab[safeCharIndex(compressed_probe[i])];
116  }
117  result[len] = 0;
118  return result;
119 }
120 
121 inline GBDATA *expect_entry(GBDATA *gb_species, const char *entry_name) {
122  GBDATA *gb_entry = GB_entry(gb_species, entry_name);
123  if (!gb_entry) {
124  GB_export_errorf("Expected entry '%s' is missing for species '%s'",
125  entry_name, GBT_get_name_or_description(gb_species));
126  }
127  return gb_entry;
128 }
129 
130 cache::Cache<SmartCharPtr> probe_input_data::seq_cache(1); // resized later
131 cache::Cache<probe_input_data::SmartIntPtr> probe_input_data::rel2abs_cache(1); // resized later
132 
134  GBDATA *gb_cs = expect_entry(gb_species_, "cs");
135  GBDATA *gb_compr = expect_entry(gb_species_, "compr");
136  GBDATA *gb_baseoff = expect_entry(gb_species_, "baseoff");
137 
138  GB_ERROR error = NULp;
139  if (!gb_cs || !gb_compr || !gb_baseoff) error = GB_await_error();
140  else {
141  gb_species = gb_species_;
142  size = GB_read_count(gb_compr);
143  }
144 
145  return error;
146 }
147 
148 inline GB_ERROR PT_prepare_species_sequence(GBDATA *gb_species, const char *alignment_name, bool& data_missing, PT_compressed& compressed) {
149  GB_ERROR error = NULp;
150  GBDATA *gb_ali = GB_entry(gb_species, alignment_name);
151  GBDATA *gb_data = gb_ali ? GB_entry(gb_ali, "data") : NULp;
152 
153  data_missing = false;
154 
155  if (!gb_data) {
156  data_missing = true;
157  }
158  else {
159  const char *seq = GB_read_char_pntr(gb_data);
160  if (!seq) {
161  error = GBS_global_string("Could not read data in '%s' for species '%s'\n(Reason: %s)",
162  alignment_name, GBT_get_name_or_description(gb_species), GB_await_error());
163  }
164  else {
165  size_t seqlen = GB_read_string_count(gb_data);
166  if (seqlen>compressed.get_allowed_size()) {
167  error = GBS_global_string("Sequence too long in '%s' of '%s'\n(Hint: format alignment to fix this problem)",
168  alignment_name, GBT_get_name_or_description(gb_species));
169  }
170 
171  if (!error) {
172  compressed.createFrom(seq, seqlen);
173  {
174  uint32_t checksum = GB_checksum(seq, seqlen, 1, ".-");
175  GBDATA *gb_cs = GB_create(gb_species, "cs", GB_INT);
176  error = gb_cs
177  ? GB_write_int(gb_cs, int32_t(checksum))
178  : GB_await_error();
179  }
180  }
181 
182  if (!error) {
183  GBDATA *gb_compr = GB_create(gb_species, "compr", GB_BYTES);
184  error = gb_compr
185  ? GB_write_bytes(gb_compr, compressed.get_seq(), compressed.get_size())
186  : GB_await_error();
187  }
188 
189  if (!error) {
190  GBDATA *gb_baseoff = GB_create(gb_species, "baseoff", GB_INTS);
191  error = gb_baseoff
192  ? GB_write_ints(gb_baseoff, compressed.get_offsets(), compressed.get_size())
193  : GB_await_error();
194  }
195 
196  if (!error) error = GB_delete(gb_ali); // delete original seq data
197  }
198  }
199 
200  return error;
201 }
202 
206 
207  if (!gb_species_data) {
208  error = GB_await_error();
209  }
210  else {
211  long icount = GB_number_of_subentries(gb_species_data);
212  int data_missing = 0;
213 
214  char *ali_name = GBT_get_default_alignment(gb_main);
215  if (!ali_name) {
216  error = GB_await_error();
217  }
218  else {
219  long ali_len = GBT_get_alignment_len(gb_main, ali_name);
220  pt_assert(ali_len>0);
221 
222  PT_compressed compressBuffer(ali_len);
223 
224  printf("Database contains %li species\n", icount);
225  {
226  {
227  arb_progress progress("Preparing sequence data", icount);
228  for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data);
229  gb_species && !error;
230  )
231  {
232  GBDATA *gb_next = GBT_next_species(gb_species);
233  bool no_data;
234 
235  error = PT_prepare_species_sequence(gb_species, ali_name, no_data, compressBuffer);
236  if (no_data) {
237  pt_assert(!error);
238  data_missing++;
239  error = GB_delete(gb_species);
240  }
241  progress.inc();
242  gb_species = gb_next;
243  }
244  if (error) progress.done();
245  }
246 
247  if (!error) {
248  char *master_data_name = GBS_global_string_copy("%s/@master_data", GB_SYSTEM_FOLDER);
249  GBDATA *gb_master_data = GB_search(gb_main, master_data_name, GB_FIND);
250  if (gb_master_data) error = GB_delete(gb_master_data);
251  free(master_data_name);
252  }
253  }
254  if (data_missing) {
255  printf("\n%i species were ignored because of missing data.\n", data_missing);
256  }
257  else {
258  printf("\nAll species contain data in alignment '%s'.\n", ali_name);
259  }
260  fflush_all();
261  free(ali_name);
262  }
263  }
264 
265  error = GB_end_transaction(gb_main, error);
266  return error;
267 }
268 
270  // reads sequence data into psg.data
271 
273 
274  // read ref SAI (e.g. ecoli)
275  {
276  char *def_ref = GBT_get_default_ref(psg.gb_main);
277  GBDATA *gb_sai_data = GBT_get_SAI_data(psg.gb_main);
278  GBDATA *gb_ref = GBT_find_SAI_rel_SAI_data(gb_sai_data, def_ref);
279 
280  psg.ecoli = NULp;
281  if (gb_ref) {
282  GBDATA *gb_data = GBT_find_sequence(gb_ref, psg.alignment_name);
283  if (gb_data) {
284  psg.ecoli = GB_read_string(gb_data); // @@@ NOT_ALL_SAI_HAVE_DATA
285  }
286  }
287  free(def_ref);
288  }
289 
291  long icount = GB_number_of_subentries(gb_species_data);
292 
293  psg.data = new probe_input_data[icount];
294  psg.data_count = 0;
295 
296  printf("Database contains %li species\n", icount);
297 
298  GB_ERROR error = NULp;
299  {
300  arb_progress progress("Checking data", icount);
301  int count = 0;
302 
303  for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data);
304  gb_species;
305  gb_species = GBT_next_species(gb_species))
306  {
307  probe_input_data& pid = psg.data[count];
308 
309  error = pid.init(gb_species);
310  if (error) break;
311  count++;
312  progress.inc();
313  }
314 
315  psg.data_count = count;
317 
318  if (error) progress.done();
319  }
320 
321  fflush_all();
322  return error;
323 }
324 
326  long i;
328  for (i=0; i<psg.data_count; i++) {
330  }
331  unsigned int max_size;
332  max_size = 0;
333  for (i = 0; i < psg.data_count; i++) { // get max sequence len
334  max_size = std::max(max_size, (unsigned)(psg.data[i].get_size()));
335  psg.char_count += psg.data[i].get_size();
336  }
337  psg.max_size = max_size;
338 
339  if (psg.ecoli) {
340  BI_ecoli_ref *ref = new BI_ecoli_ref;
341  ref->init(psg.ecoli, strlen(psg.ecoli));
342  psg.bi_ecoli = ref;
343  }
344 }
345 
346 
347 long PT_abs_2_ecoli_rel(long pos) {
348  if (!psg.ecoli) return pos;
349  return psg.bi_ecoli->abs_2_rel(pos);
350 }
351 
struct probe_input_data * data
Definition: probe.h:356
GB_ERROR GB_begin_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2528
char * alignment_name
Definition: probe.h:352
const char * GB_ERROR
Definition: arb_core.h:25
#define GB_SYSTEM_FOLDER
Definition: arbdb.h:27
string result
GB_ERROR PT_prepare_species_sequence(GBDATA *gb_species, const char *alignment_name, bool &data_missing, PT_compressed &compressed)
Definition: PT_io.cxx:148
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GB_ERROR GB_commit_transaction(GBDATA *gbd)
Definition: arbdb.cxx:2551
static GB_ERROR tab(GBL_command_arguments *args, bool pretab)
Definition: adlang1.cxx:914
Definition: probe.h:83
void fflush_all()
Definition: PT_tools.h:211
GB_ERROR GB_write_bytes(GBDATA *gbd, const char *s, long size)
Definition: arbdb.cxx:1434
long GBS_write_hash(GB_HASH *hs, const char *key, long val)
Definition: adhash.cxx:454
GB_ERROR init(GBDATA *gb_species_)
Definition: PT_io.cxx:133
CONSTEXPR_INLINE unsigned char safeCharIndex(char c)
Definition: dupstr.h:73
Definition: probe.h:84
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
Definition: arbdb.cxx:2561
probe_struct_global psg
Definition: PT_main.cxx:36
const unsigned * get_offsets() const
Definition: PT_compress.h:184
GB_ERROR PT_prepare_data(GBDATA *gb_main)
Definition: PT_io.cxx:203
Definition: probe.h:85
GBDATA * gb_main
Definition: probe.h:351
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
int get_size() const
Definition: probe.h:211
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
Definition: adali.cxx:833
static char * alignment_name
GB_HASH * namehash
Definition: probe.h:353
uint32_t GB_checksum(const char *seq, long length, int ignore_case, const char *exclude)
Definition: adstring.cxx:319
GB_shell * gb_shell
Definition: probe.h:350
FILE * seq
Definition: rns.c:46
size_t get_allowed_size() const
Definition: PT_compress.h:182
const char * get_seq() const
Definition: PT_compress.h:183
GBDATA * expect_entry(GBDATA *gb_species, const char *entry_name)
Definition: PT_io.cxx:121
void createFrom(const unsigned char *const seq, const size_t length)
Definition: PT_compress.h:120
GB_ERROR GB_delete(GBDATA *&source)
Definition: arbdb.cxx:1916
GBDATA * GBT_first_species_rel_species_data(GBDATA *gb_species_data)
Definition: aditem.cxx:121
size_t GB_read_string_count(GBDATA *gbd)
Definition: arbdb.cxx:916
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
long GB_read_count(GBDATA *gbd)
Definition: arbdb.cxx:758
int abs_2_rel(int abs) const
Definition: BI_basepos.hxx:76
GB_ERROR PT_init_input_data()
Definition: PT_io.cxx:269
TYPE * ARB_alloc(size_t nelem)
Definition: arb_mem.h:56
size_t probe_compress_sequence(char *seq, size_t seqsize)
Definition: PT_io.cxx:78
GBDATA * GB_create(GBDATA *father, const char *key, GB_TYPES type)
Definition: arbdb.cxx:1781
GBDATA * gb_species_data
Definition: adname.cxx:33
long GB_number_of_subentries(GBDATA *gbd)
Definition: arbdb.cxx:2892
char * readable_probe(const char *compressed_probe, size_t len, char T_or_U)
Definition: PT_io.cxx:91
int compress_data(char *probestring)
Definition: PT_io.cxx:21
#define SmartMallocPtr(type)
Definition: smartptr.h:45
static void error(const char *msg)
Definition: mkptypes.cxx:96
const char * get_shortname() const
Definition: probe.h:170
long PT_abs_2_ecoli_rel(long pos)
Definition: PT_io.cxx:347
ARB_ERROR probe_read_data_base(const char *name, bool readOnly)
Definition: PT_io.cxx:50
void PT_build_species_hash()
Definition: PT_io.cxx:325
void init(const char *seq, int size)
Definition: BI_basepos.hxx:109
Definition: probe.h:86
Definition: probe.h:88
#define pt_assert(bed)
Definition: PT_tools.h:22
char * GBT_get_default_ref(GBDATA *)
Definition: adtools.cxx:66
Definition: arbdb.h:86
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
Definition: adali.cxx:708
GB_ERROR GB_write_int(GBDATA *gbd, long i)
Definition: arbdb.cxx:1250
bool GB_is_writeablefile(const char *filename)
Definition: arb_file.cxx:148
Definition: arbdb.h:72
GB_ERROR GB_export_errorf(const char *templat,...)
Definition: arb_msg.cxx:262
BI_ecoli_ref * bi_ecoli
Definition: probe.h:359
GB_ERROR GB_write_ints(GBDATA *gbd, const GB_UINT4 *i, long size)
Definition: arbdb.cxx:1439
char * GB_read_string(GBDATA *gbd)
Definition: arbdb.cxx:909
size_t get_size() const
Definition: PT_compress.h:181
void GB_set_verbose()
Definition: ad_load.cxx:31
unsigned char uchar
Definition: gde.hxx:21
GBDATA * GBT_next_species(GBDATA *gb_species)
Definition: aditem.cxx:128
#define NULp
Definition: cxxforward.h:116
char * GBT_get_default_alignment(GBDATA *gb_main)
Definition: adali.cxx:747
GBDATA * GBT_find_SAI_rel_SAI_data(GBDATA *gb_sai_data, const char *name)
Definition: aditem.cxx:171
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:32
Definition: arbdb.h:71
GBDATA * GBT_get_SAI_data(GBDATA *gb_main)
Definition: aditem.cxx:154
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
Definition: adquery.cxx:531
GB_CSTR GBT_get_name_or_description(GBDATA *gb_item)
Definition: aditem.cxx:459
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
Definition: probe.h:87
char * GBS_global_string_copy(const char *templat,...)
Definition: arb_msg.cxx:194
GB_HASH * GBS_create_hash(long estimated_elements, GB_CASE case_sens)
Definition: adhash.cxx:253
Definition: arbdb.h:66
GBDATA * GBT_get_species_data(GBDATA *gb_main)
Definition: aditem.cxx:105
#define max(a, b)
Definition: f2c.h:154