ARB
ps_convert_db.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ps_convert_db.cxx //
4 // Purpose : //
5 // //
6 // Coded by Wolfram Foerster in October 2002 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "ps_tools.hxx"
13 #include "ps_database.hxx"
14 #include "ps_pg_tree_functions.hxx"
15 #include "ps_pg_specmap.hxx"
16 
17 #include <sys/times.h>
18 
19 
20 // GLOBALS
21 
23 static int __PROBE_LENGTH;
26 
27 static void PS_detect_probe_length(GBDATA *_ARB_node) {
28  // recursively walk through database to first probe and get its length
29  __PROBE_LENGTH = -1;
30 
31  // search for a probe
32  while (__PROBE_LENGTH < 0) {
33  GBDATA *ARB_group = GB_entry(_ARB_node, "group");
34  if (ARB_group) { // ps_node has probes
35  GBDATA *probe = PG_get_first_probe(ARB_group);
36  __PROBE_LENGTH = strlen(PG_read_probe(probe));
37  }
38  else { // ps_node has no probes .. check its children
39  GBDATA *ARB_child = PS_get_first_node(_ARB_node);
40  while (ARB_child && (__PROBE_LENGTH < 0)) {
41  PS_detect_probe_length(ARB_child);
42  ARB_child = PS_get_next_node(ARB_child);
43  }
44  }
45  }
46 }
47 
48 PS_NodePtr PS_assert_inverse_path(const int _max_depth, const int _caller_ID, IDVector *_path) {
49  // walk down the 'inverse path' creating empty nodes as necessary
50 
51  PS_NodePtr current_node = __ROOT;
52  SpeciesID current_ID;
53 
54  // handle given path
55  int c = 0;
56  for (IDVectorCIter i = _path->begin(); i != _path->end(); ++i, ++c) {
57  current_ID = *i;
58  current_node = current_node->assertChild(current_ID);
59  }
60 
61  // handle implicit path
62  c = 0;
63  for (current_ID = _caller_ID+1; current_ID <= _max_depth; ++current_ID, ++c) {
64  current_node = current_node->assertChild(current_ID);
65  }
66 
67  return current_node;
68 }
69 
70 PS_NodePtr PS_assert_path(const int _caller_ID, IDVector *_path) {
71  // walk down the 'path' creating empty nodes as necessary
72 
73  PS_NodePtr current_node = __ROOT;
74  SpeciesID next_path_ID;
75 
76  // handle given path
77  int c = 0;
78  IDVectorCIter i = _path->begin();
79  next_path_ID = (i == _path->end()) ? -1 : *i;
80  for (SpeciesID current_ID = __MIN_ID; current_ID <= _caller_ID; ++current_ID, ++c) {
81  if (current_ID != next_path_ID) {
82  current_node = current_node->assertChild(current_ID);
83  }
84  else {
85  ++i;
86  next_path_ID = (i == _path->end()) ? -1 : *i;
87  }
88  }
89 
90  return current_node;
91 }
92 
93 void PS_extract_probe_data(GBDATA *_ARB_node, // position in ARB database
94  int _max_depth, // count of species in ARB database
95  int _depth, // current depth in tree
96  const int _parent_ID, // SpeciesID of parent node
97  IDVector *_inverse_path) { // list with IDs of the 'inverse path'
98 
99  // recursively walk through ARB-database and extract probe-data to own tree format
100  //
101  // * Insertion of nodes takes place after a branch is completed (that is
102  // when ive reached a leaf in the ARB-database and im going 'back up'
103  // out of the recursion).
104  //
105  // * Branches below _max_depth/2 will be moved up top by inserting nodes with
106  // 'inverse' probes in the 'inverse' branch, therefore the _inverse_path
107  // list is maintained with the SpeciesIDs of the 'inverse path'.
108  // - SpeciesIDs between _parent_ID and current ID are 'missing' in the path
109  // and are appended to the _inverse_path list
110  // - SpeciesIDs greater than the current ID are implicit in the
111  // 'inverse path' list and therefore not stored
112 
113  //
114  // get SpeciesID
115  //
116  GBDATA *data = GB_entry(_ARB_node, "num");
117  const char *buffer = GB_read_char_pntr(data);
118  SpeciesID id = atoi(buffer);
119 
120  //
121  // get probe(s)
122  //
123  PS_ProbeSetPtr probes = NULp;
124  GBDATA *ARB_group = GB_entry(_ARB_node, "group"); // access probe-group
125  if (ARB_group) {
126  data = PG_get_first_probe(ARB_group); // get first probe if exists
127 
128  if (data) probes = new PS_ProbeSet; // new probe set if probes exist
129 
130  while (data) {
131  buffer = PG_read_probe(data); // get probe string
132  PS_ProbePtr new_probe(new PS_Probe); // make new probe
133  new_probe->length = __PROBE_LENGTH; // set probe length
134  new_probe->quality = 100; // set probe quality
135  new_probe->GC_content = 0; // eval probe for GC-content
136  for (int i=0; i < __PROBE_LENGTH; ++i) {
137  if ((buffer[i] == 'C') || (buffer[i] == 'G')) ++(new_probe->GC_content);
138  }
139  probes->insert(new_probe); // append probe to probe set
140  data = PG_get_next_probe(data); // get next probe
141  }
142  }
143 
144  //
145  // enlarge inverse path
146  //
147  for (int i=_parent_ID+1; ((i < id) && (i >= 0)); ++i) {
148  _inverse_path->push_back(i);
149  }
150 
151  //
152  // insertion if ARB_node had probes
153  //
154  if (probes) {
155  if (_depth <= (_max_depth >> 1)) {
156  //
157  // insert if 'above' half depth
158  //
159  PS_NodePtr current_node = PS_assert_path(id, _inverse_path);
160  current_node->addProbes(probes->begin(), probes->end());
161  }
162  else {
163  //
164  // insert if 'below' half depth
165  //
166  PS_NodePtr current_node = PS_assert_inverse_path(_max_depth, id, _inverse_path);
167  current_node->addProbesInverted(probes->begin(), probes->end());
168  }
169  }
170 
171  //
172  // child(ren)
173  //
174  GBDATA *ARB_child = PS_get_first_node(_ARB_node); // get first child if exists
175  while (ARB_child) {
176  PS_extract_probe_data(ARB_child, _max_depth, _depth+1, id, _inverse_path);
177  ARB_child = PS_get_next_node(ARB_child);
178  }
179 
180  //
181  // shrink inverse path
182  //
183  while ((_inverse_path->back() > _parent_ID) && (!_inverse_path->empty())) {
184  _inverse_path->pop_back();
185  }
186 }
187 
188 
189 int main(int _argc, char *_argv[]) {
190  GBDATA *gb_main = NULp;
191  GB_ERROR error = NULp;
192 
193  // open probe-group-database
194  if (_argc < 2) {
195  printf("Missing arguments\n Usage %s <input database name>\n", _argv[0]);
196  printf("output database will be named like input database but with the suffix '.wf' instead of '.arb'\n");
197  exit(1);
198  }
199 
200  const char *DB_name = _argv[1];
201 
202  //
203  // open and check ARB database
204  //
205  struct tms before;
206  times(&before);
207  printf("Opening probe-group-database '%s'..\n ", DB_name);
208  gb_main = GB_open(DB_name, "rwcN");
209  if (!gb_main) {
210  error = GB_await_error();
211  GB_warning(error);
212  exit(1);
213  }
214  printf("..loaded database (enter to continue) ");
215  PS_print_time_diff(&before);
216 
217  GB_transaction ta(gb_main);
218  GBDATA *group_tree = GB_entry(gb_main, "group_tree");
219  if (!group_tree) {
220  printf("no 'group_tree' in database\n");
221  error = GB_export_error("no 'group_tree' in database"); // @@@ error is unused
222  exit(1);
223  }
224  GBDATA *first_level_node = PS_get_first_node(group_tree);
225  if (!first_level_node) {
226  printf("no 'node' found in group_tree\n");
227  error = GB_export_error("no 'node' found in group_tree"); // @@@ error is unused
228  exit(1);
229  }
230 
231  //
232  // read Name <-> ID mappings
233  //
234  times(&before);
235  printf("init Species <-> ID - Map\n");
236  PG_initSpeciesMaps(gb_main);
237  int species_count = PG_NumberSpecies();
238  printf("%i species in the map ", species_count);
239  if (species_count >= 10) {
240  printf("\nhere are the first 10 :\n");
241  int count = 0;
242  for (ID2NameMapCIter i=__ID2NAME_MAP.begin(); count<10; ++i, ++count) {
243  printf("[ %2i ] %s\n", i->first, i->second.c_str());
244  }
245  }
246  __MIN_ID = __ID2NAME_MAP.begin()->first;
247  __MAX_ID = __ID2NAME_MAP.rbegin()->first;
248  printf("IDs %i .. %i\n(enter to continue) ", __MIN_ID, __MAX_ID);
249  PS_print_time_diff(&before);
250 
251  //
252  // create output database
253  //
254  string output_DB_name(DB_name);
255  size_t suffix_pos = output_DB_name.rfind(".arb");
256  if (suffix_pos != string::npos) {
257  output_DB_name.erase(suffix_pos);
258  }
259  output_DB_name.append(".wf");
260  if (suffix_pos == string::npos) {
261  printf("cannot find suffix '.arb' in database name '%s'\n", DB_name);
262  printf("output file will be named '%s'\n", output_DB_name.c_str());
263  }
264  PS_Database *ps_db = new PS_Database(output_DB_name.c_str(), PS_Database::WRITEONLY);
265 
266  //
267  // copy mappings
268  //
269  ps_db->setMappings(__NAME2ID_MAP, __ID2NAME_MAP);
270 
271  //
272  // extract data from ARB database
273  //
274  times(&before);
275  printf("extracting probe-data...\n");
276  PS_detect_probe_length(group_tree);
277  printf("probe_length = %d\n", __PROBE_LENGTH);
278 
279  __ROOT = ps_db->getRootNode();
280  first_level_node = PS_get_first_node(group_tree);
281  unsigned int c = 0;
282  IDVector *inverse_path = new IDVector;
283  struct tms before_first_level_node;
284  for (; first_level_node; ++c) {
285  if (c % 100 == 0) {
286  times(&before_first_level_node);
287  printf("1st level node #%u ", c+1);
288  }
289  PS_extract_probe_data(first_level_node, species_count, 0, __MIN_ID-1, inverse_path);
290  first_level_node = PS_get_next_node(first_level_node);
291  if (c % 100 == 0) {
292  PS_print_time_diff(&before_first_level_node, "this node ", " ");
293  PS_print_time_diff(&before, "total ", "\n");
294  }
295  }
296  printf("done after %u 1st level nodes\n", c);
297  printf("(enter to continue) ");
298  PS_print_time_diff(&before);
299 
300  //
301  // write database to file
302  //
303  times(&before);
304  printf("writing probe-data to %s..\n", output_DB_name.c_str());
305  ps_db->save();
306  printf("..done saving (enter to continue) ");
307  PS_print_time_diff(&before);
308 
309  delete inverse_path;
310  delete ps_db;
311  before.tms_utime = 0;
312  before.tms_stime = 0;
313  printf("total ");
314  PS_print_time_diff(&before);
315  return 0;
316 }
GBDATA * PG_get_first_probe(GBDATA *pb_group)
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
IDVector::const_iterator IDVectorCIter
Definition: ps_defs.hxx:34
GBDATA * PG_get_next_probe(GBDATA *pb_probe)
void GB_warning(const char *message)
Definition: arb_msg.cxx:530
static SpeciesID __MAX_ID
void PS_print_time_diff(const struct tms *_since, const char *_before, const char *_after)
Definition: ps_tools.cxx:21
static Name2IDMap __NAME2ID_MAP
void PS_extract_probe_data(GBDATA *_ARB_node, int _max_depth, int _depth, const int _parent_ID, IDVector *_inverse_path)
static void PS_detect_probe_length(GBDATA *_ARB_node)
static SpeciesID __MIN_ID
PS_NodePtr assertChild(SpeciesID _id)
Definition: ps_node.hxx:89
void addProbes(PS_ProbeSetCIter _begin, PS_ProbeSetCIter _end)
Definition: ps_node.hxx:138
static ID2NameMap __ID2NAME_MAP
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
void addProbesInverted(PS_ProbeSetCIter _begin, PS_ProbeSetCIter _end)
Definition: ps_node.hxx:146
static PS_NodePtr __ROOT
GB_ERROR GB_export_error(const char *error)
Definition: arb_msg.cxx:257
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
static GB_ERROR PG_initSpeciesMaps(GBDATA *pb_main)
PS_ProbeSet * PS_ProbeSetPtr
Definition: ps_node.hxx:50
GBDATA * PS_get_first_node(GBDATA *pb_nodecontainer)
static void error(const char *msg)
Definition: mkptypes.cxx:96
GBDATA * PS_get_next_node(GBDATA *pb_node)
std::vector< SpeciesID > IDVector
Definition: ps_defs.hxx:32
ID2NameMap::const_iterator ID2NameMapCIter
Definition: ps_defs.hxx:27
#define NULp
Definition: cxxforward.h:116
GB_transaction ta(gb_var)
PS_NodePtr PS_assert_path(const int _caller_ID, IDVector *_path)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
GBDATA * gb_main
Definition: adname.cxx:32
static int PG_NumberSpecies()
int main(int _argc, char *_argv[])
set< PS_ProbePtr, lt_probe > PS_ProbeSet
Definition: ps_node.hxx:49
PS_NodePtr PS_assert_inverse_path(const int _max_depth, const int _caller_ID, IDVector *_path)
const char * PG_read_probe(GBDATA *pb_probe)
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
static int __PROBE_LENGTH