ARB
ps_get_probes.cxx
Go to the documentation of this file.
1 // =============================================================== //
2 // //
3 // File : ps_get_probes.cxx //
4 // Purpose : //
5 // //
6 // Coded by Wolfram Foerster in October 2002 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // =============================================================== //
11 
12 #include "ps_pg_tree_functions.hxx"
13 #include "ps_filebuffer.hxx"
14 
15 #include <sys/times.h>
16 
17 // GLOBALS
18 
19 static char *__ARB_DB_NAME = NULp;
20 static GBDATA *__ARB_DB = NULp;
23 
24 
25 bool PS_get_probe_for_path(IDSet& _path, unsigned int _GC_content, unsigned int _probe_length, char *_probe_data) {
26  GB_transaction ta(__ARB_DB);
27  SpeciesID num;
28  GBDATA *data;
29  GBDATA *arb_node;
30  GBDATA *arb_group;
31  IDSetCIter id;
32  unsigned int gc_content;
33 
34  // iterate over path
35  arb_node = __ARB_GROUP_TREE;
36  for (id = _path.begin();
37  (id != _path.end()) && arb_node;
38  ++id) {
39 
40  // search for arb-node with num == id
41  arb_node = PS_get_first_node(arb_node); if (!arb_node) break;
42  data = GB_entry(arb_node, "num");
43  num = atoi(GB_read_char_pntr(data));
44  while (num != *id) {
45  // get next node
46  arb_node = PS_get_next_node(arb_node);
47  if (!arb_node) break;
48  // get num of arb-node
49  data = GB_entry(arb_node, "num");
50  num = atoi(GB_read_char_pntr(data));
51  }
52  if (!arb_node) break;
53  }
54  if (!arb_node) {
55  printf(" ERROR : failed to get node for ID (%i)\n", *id);
56  return false;
57  }
58 
59  // search for probe with GC-content == _GC_content
60  arb_group = GB_entry(arb_node, "group");
61  if (!arb_group) {
62  printf(" ERROR : failed to get group of node");
63  return false;
64  }
65  data = PG_get_first_probe(arb_group);
66  if (!data) {
67  printf(" ERROR : failed to get first probe of group of node");
68  return false;
69  }
70  while (_probe_data[0] == '\x00') {
71  const char *buffer = PG_read_probe(data); // read probe data
72  gc_content = 0;
73  for (unsigned int i = 0; i < _probe_length; ++i) { // calc GC-content
74  if ((buffer[i] == 'C') || (buffer[i] == 'G')) ++gc_content;
75  }
76  if (gc_content == _GC_content) { // found matching probe ?
77  for (unsigned int i = 0; i < _probe_length; ++i) { // store probe data
78  _probe_data[i] = buffer[i];
79  }
80  }
81  else {
82  data = PG_get_next_probe(data); // get next probe
83  if (!data) break;
84  }
85  }
86  if (!data) {
87  printf(" ERROR : failed to find probe with GC-content (%u)\n", _GC_content);
88  return false;
89  }
90 
91  return true;
92 }
93 
94 
95 // ====================================================
96 // ====================================================
97 int main(int argc,
98  char *argv[]) {
99  //
100  // check arguments
101  //
102  if (argc < 3) {
103  printf("Missing argument\n Usage %s <final candidates paths filename> <prefix to arb-databases>\n", argv[0]);
104  printf("Example:\n %s ~/data/850.final_candidates.paths ~/data/ssjun03_Eucarya_850.pg_\n", argv[0]);
105  exit(1);
106  }
107 
108  char *final_candidates_paths_filename = argv[1];
109  char *arb_db_name_prefix = argv[2];
110  char *temp_filename = (char *)malloc(strlen(final_candidates_paths_filename)+1+5);
111  strcpy(temp_filename, final_candidates_paths_filename);
112  strcat(temp_filename, ".temp");
113  unlink(temp_filename);
114  printf("Opening temp-file '%s'..\n", temp_filename);
115  PS_FileBuffer *temp__file = new PS_FileBuffer(temp_filename, PS_FileBuffer::WRITEONLY);
116 
117  //
118  // candidates
119  //
120  printf("Opening candidates-paths-file '%s'..\n", final_candidates_paths_filename);
121  PS_FileBuffer *paths_file = new PS_FileBuffer(final_candidates_paths_filename, PS_FileBuffer::READONLY);
122  unsigned long paths_todo;
123  unsigned int probe_length_todo = 0;
124  unsigned int probe_buffer_length = 100;
125  char *probe_buffer = NULp;
126 
127  // read count of paths
128  paths_file->get_ulong(paths_todo);
129  temp__file->put_ulong(paths_todo);
130 
131  // read used probe lengths
132  unsigned int count;
133  set<unsigned int> probe_lengths;
134  paths_file->get_uint(count);
135  temp__file->put_uint(count);
136  printf("probe lengths :");
137  for (unsigned int i = 0; i < count; ++i) {
138  unsigned int length;
139  paths_file->get_uint(length);
140  temp__file->put_uint(length);
141  probe_lengths.insert(length);
142  printf(" %u", length);
143  }
144  printf("\n");
145 
146  // read candidates
147  while (paths_todo) {
148  printf("\npaths todo (%lu)\n", paths_todo--);
149  unsigned int probe_length;
150  unsigned int probe_GC_content;
151  unsigned int path_length;
152  IDSet path;
153  SpeciesID id;
154 
155  // read one candidate
156  paths_file->get_uint(probe_length);
157  temp__file->put_uint(probe_length);
158  paths_file->get_uint(probe_GC_content);
159  temp__file->put_uint(probe_GC_content);
160  printf(" probe length (%u) GC (%u)\n", probe_length, probe_GC_content);
161  paths_file->get_uint(path_length);
162  temp__file->put_uint(path_length);
163  printf(" path size (%u) ( ", path_length);
164  for (unsigned int i = 0; i < path_length; ++i) {
165  paths_file->get_int(id);
166  temp__file->put_int(id);
167  path.insert(id);
168  printf("%i ", id);
169  }
170  printf(")\n");
171  if (!probe_buffer || (probe_length > probe_buffer_length)) {
172  if (probe_buffer) { // adjust buffer size
173  free(probe_buffer);
174  probe_buffer_length = 2 * probe_length;
175  }
176  probe_buffer = (char*)calloc(probe_buffer_length, sizeof(char));
177  }
178  paths_file->get(probe_buffer, probe_length);
179  probe_buffer[probe_length] = '\x00';
180 
181  // handle probe
182  if (probe_buffer[0] == '\x00') {
183  if (!probe_length_todo) {
184  printf("handling probes of length %u this time\n", probe_length);
185  probe_length_todo = probe_length;
186  // init. arb-db-filename
187  // +1 for \0, +7 for 'tmp.arb', +5 for max number of digits of unsigned int
188  __ARB_DB_NAME = (char*)malloc(strlen(arb_db_name_prefix)+1+7+5);
189  sprintf(__ARB_DB_NAME, "%s%utmp.arb", arb_db_name_prefix, probe_length);
190  printf("Opening ARB-Database '%s'..\n ", __ARB_DB_NAME);
191  __ARB_DB = GB_open(__ARB_DB_NAME, "rN");
192  if (!__ARB_DB) {
193  __ARB_ERROR = GB_await_error();
194  printf("%s\n", __ARB_ERROR);
195  return 1;
196  }
197  GB_transaction ta(__ARB_DB);
198  __ARB_GROUP_TREE = GB_entry(__ARB_DB, "group_tree");
199  if (!__ARB_GROUP_TREE) {
200  printf("no 'group_tree' in database\n");
201  return 1;
202  }
203  GBDATA *first_level_node = PS_get_first_node(__ARB_GROUP_TREE);
204  if (!first_level_node) {
205  printf("no 'node' found in group_tree\n");
206  return 1;
207  }
208  }
209  if (probe_length_todo == probe_length) {
210  if (!PS_get_probe_for_path(path, probe_GC_content, probe_length, probe_buffer)) {
211  delete temp__file;
212  return 1;
213  }
214  printf(" probe data (%s) ==> updated\n", probe_buffer);
215  }
216  else {
217  printf(" probe data (%s) --> skipped\n", probe_buffer);
218  }
219  }
220  else {
221  printf(" probe data (%s) --> finished\n", probe_buffer);
222  }
223  temp__file->put(probe_buffer, probe_length);
224  }
225  probe_lengths.clear();
226  paths_file->get_uint(count);
227  for (unsigned int i = 0; i < count; ++i) {
228  unsigned int length;
229  paths_file->get_uint(length);
230  if (length != probe_length_todo) probe_lengths.insert(length);
231  }
232  printf("remaining probe lengths :");
233  temp__file->put_uint(probe_lengths.size());
234  for (set<unsigned int>::iterator length = probe_lengths.begin();
235  length != probe_lengths.end();
236  ++length) {
237  temp__file->put_uint(*length);
238  printf(" %u", *length);
239  }
240  printf("\n");
241  if (probe_buffer) free(probe_buffer);
242  if (__ARB_DB_NAME) free(__ARB_DB_NAME);
243 
244  printf("cleaning up... temp-file\n"); fflush(stdout);
245  delete temp__file;
246  printf("cleaning up... candidates-paths-file\n"); fflush(stdout);
247  delete paths_file;
248  printf("moving temp-file to candiates-paths-file\n"); fflush(stdout);
249  rename(temp_filename, final_candidates_paths_filename);
250 
251  // exit code == 0 if all probe lengths handled
252  // exit code == 1 if failure
253  // exit code >= 2 else
254  return probe_lengths.size()*2;
255 }
void put_ulong(unsigned long int _ul)
void get(void *_data, int _length)
static const bool READONLY
GBDATA * PG_get_first_probe(GBDATA *pb_group)
GBDATA * GB_open(const char *path, const char *opent)
Definition: ad_load.cxx:1363
GBDATA * PG_get_next_probe(GBDATA *pb_probe)
const char * id
Definition: AliAdmin.cxx:17
bool PS_get_probe_for_path(IDSet &_path, unsigned int _GC_content, unsigned int _probe_length, char *_probe_data)
std::set< SpeciesID > IDSet
Definition: ps_defs.hxx:39
void put_int(int _i)
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
void put_uint(unsigned int _ui)
void get_uint(unsigned int &_ui)
GB_ERROR GB_await_error()
Definition: arb_msg.cxx:342
fflush(stdout)
GBDATA * PS_get_first_node(GBDATA *pb_nodecontainer)
static char * __ARB_DB_NAME
void put(const void *_data, int _length)
void get_int(int &_i)
void get_ulong(unsigned long int &_ul)
GBDATA * PS_get_next_node(GBDATA *pb_node)
static GBDATA * __ARB_DB
int main(int argc, char *argv[])
static GB_ERROR __ARB_ERROR
#define NULp
Definition: cxxforward.h:116
GB_transaction ta(gb_var)
static GBDATA * __ARB_GROUP_TREE
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
Definition: arbdb.cxx:904
size_t length
static const bool WRITEONLY
const char * PG_read_probe(GBDATA *pb_probe)
GBDATA * GB_entry(GBDATA *father, const char *key)
Definition: adquery.cxx:334
IDSet::const_iterator IDSetCIter
Definition: ps_defs.hxx:41