ARB
GDE_FileIO.cxx
Go to the documentation of this file.
1 #include "GDE_proto.h"
2 
3 #include <aw_msg.hxx>
4 
5 #include <climits>
6 #include <algorithm>
7 
8 static void Regroup(NA_Alignment& alignment) {
9  size_t j;
10  size_t group;
11  int last;
12 
13  for (j=0; j<alignment.numelements; j++) {
14  alignment.element[j].groupf = NULp;
15  alignment.element[j].groupb = NULp;
16  }
17 
18  for (group = 1; group <= alignment.numgroups; group++) {
19  last = -1;
20  for (j=0; j<alignment.numelements; j++)
21  if (alignment.element[j].groupid == group) {
22  if (last != -1) {
23  alignment.element[j].groupb = &(alignment.element[last]);
24  alignment.element[last].groupf = &(alignment.element[j]);
25  }
26  last = j;
27  }
28  }
29 }
30 
31 static void NormalizeOffset(NA_Alignment& aln) {
32  size_t j;
33  int offset = INT_MAX;
34 
35  for (j=0; j<aln.numelements; j++)
36  offset = std::min(offset, aln.element[j].offset);
37 
38  for (j=0; j<aln.numelements; j++)
39  aln.element[j].offset -= offset;
40 
41  aln.maxlen = INT_MIN;
42  for (j=0; j<aln.numelements; j++)
43  aln.maxlen = std::max(aln.element[j].seqlen+aln.element[j].offset, aln.maxlen);
44 
45  gde_assert(aln.maxlen >= 0);
46 
47  aln.rel_offset += offset;
48 
49  if (aln.numelements == 0) aln.rel_offset = 0;
50 }
51 
52 static void ReadNA_Flat(char *filename, NA_Alignment& data) {
53  size_t j;
54  int jj, curelem=0, offset;
55  char buffer[GBUFSIZ];
56  char in_line[GBUFSIZ];
57 
58  NA_Sequence *this_elem;
59 
60  FILE *file = fopen(filename, "r");
61  if (!file) {
62  fprintf(stderr, "Cannot open %s.\n", filename);
63  }
64  else {
65  for (; fgets(in_line, GBUFSIZ, file);) {
66  if (in_line[0] == '#' ||
67  in_line[0] == '%' ||
68  in_line[0] == '"' ||
69  in_line[0] == '@')
70  {
71  offset = 0;
72  for (j=0; j<strlen(in_line); j++) {
73  if (in_line[j] == '(') {
74  sscanf((char*)&(in_line[j+1]), "%d", &offset);
75  in_line[j] = '\0';
76  }
77  }
78 
79  curelem = Arbdb_get_curelem(data);
80 
81  InitNASeq(&(data.element[curelem]),
82  in_line[0] == '#' ? DNA :
83  in_line[0] == '%' ? PROTEIN :
84  in_line[0] == '"' ? TEXT :
85  in_line[0] == '@' ? MASK : TEXT);
86  this_elem = &(data.element[curelem]);
87  if (in_line[strlen(in_line)-1] == '\n')
88  in_line[strlen(in_line)-1] = '\0';
89  strcpy_truncate(this_elem->short_name, in_line+1, SIZE_SHORT_NAME);
90  this_elem->offset = offset;
91  }
92  else if (in_line[0] != '\n') {
93  size_t strl = strlen(in_line);
94  for (j=0, jj=0; j<strl; j++) {
95  if (in_line[j] != ' ' && in_line[j] != '\n' && in_line[j] != '\t') {
96  buffer[jj++] = in_line[j];
97  }
98  }
99 
100  if (data.element[curelem].rmatrix) {
101  Ascii2NA(buffer, jj, data.element[curelem].rmatrix);
102  }
103  AppendNA((NA_Base*)buffer, jj, &(data.element[curelem]));
104  }
105  }
106  fclose(file);
107 
108  for (j=0; j<data.numelements; j++) {
109  data.maxlen = std::max(data.maxlen, data.element[j].seqlen + data.element[j].offset);
110  }
111 
112  NormalizeOffset(data);
113  Regroup(data);
114  }
115 }
116 
117 /*
118  LoadFile():
119  Load the given filename into the given dataset. Handle any
120  type conversion needed to get the data into the specified data type.
121  This routine is used in situations where the format and datatype is known.
122 
123  Copyright (c) 1989-1990, University of Illinois board of trustees. All
124  rights reserved. Written by Steven Smith at the Center for Prokaryote Genome
125  Analysis. Design and implementation guidance by Dr. Gary Olsen and Dr.
126  Carl Woese.
127 
128  Copyright (c) 1990,1991,1992 Steven Smith at the Harvard Genome Laboratory.
129  All rights reserved.
130 */
131 
132 static GB_ERROR LoadFile(char *filename, NA_Alignment& dataset, int type, int format) {
133  GB_ERROR error = NULp;
134  if (DataType != type)
135  fprintf(stderr, "Warning, datatypes do not match.\n");
136  /*
137  Handle the overwrite/create/merge dialog here.
138  */
139  switch (format) {
140  case NA_FLAT:
141  ReadNA_Flat(filename, dataset);
142  dataset.format = GDE;
143  break;
144 
145  case GENBANK:
146  error = ReadGen(filename, dataset);
147  dataset.format = GENBANK;
148  break;
149 
150  case GDE:
151  gde_assert(0); // should no longer occur
152  break;
153 
154  default:
155  break;
156  }
157  return error;
158 }
159 
160 static int FindType(char *name, int *dtype, int *ftype) {
161  FILE *file = fopen(name, "r");
162 
163  *dtype = 0;
164  *ftype = 0;
165 
166  int result = 1;
167  if (file) {
168  /* Is this a flat file?
169  * Get the first non blank line, see if a type marker shows up.
170  */
171  char in_line[GBUFSIZ];
172  if (fgets(in_line, GBUFSIZ, file)) {
173  for (; strlen(in_line)<2 && fgets(in_line, GBUFSIZ, file);) ;
174 
175  if (in_line[0] == '#' || in_line[0] == '%' ||
176  in_line[0] == '"' || in_line[0] == '@')
177  {
178  *dtype=NASEQ_ALIGN;
179  *ftype=NA_FLAT;
180  result = 0;
181  }
182  else { // try genbank
183  fclose(file);
184  file = fopen(name, "r");
185  *dtype=0;
186  *ftype=0;
187 
188  if (file) {
189  for (; fgets(in_line, GBUFSIZ, file);) {
190  if (Find(in_line, "LOCUS")) {
191  *dtype = NASEQ_ALIGN;
192  *ftype = GENBANK;
193  break;
194  }
195  else if (Find(in_line, "sequence")) { // try GDE
196  *dtype = NASEQ_ALIGN;
197  *ftype = GDE;
198  break;
199  }
200  }
201  result = 0;
202  }
203  }
204  }
205  fclose(file);
206  }
207 
208  return result;
209 }
210 
211 void LoadData(char *filen, NA_Alignment& dataset) {
212  /* LoadData():
213  * Load a data set from the command line argument.
214  *
215  * Copyright (c) 1989, University of Illinois board of trustees. All rights
216  * reserved. Written by Steven Smith at the Center for Prokaryote Genome
217  * Analysis. Design and implementation guidance by Dr. Gary Olsen and Dr.
218  * Carl Woese.
219  * Copyright (c) 1990,1991,1992 Steven Smith at the Harvard Genome Laboratory.
220  * All rights reserved.
221  */
222 
223  // Get file name, determine the file type, and away we go..
224  if (Find2(filen, "gde") != 0)
225  strcpy(FileName, filen);
226 
227  FILE *file = fopen(filen, "r");
228  if (file) {
229  FindType(filen, &DataType, &FileFormat);
230  switch (DataType) {
231  case NASEQ_ALIGN: {
232  GB_ERROR error = LoadFile(filen, dataset, DataType, FileFormat);
233  if (error) aw_message(error);
234  break;
235  }
236  default:
237  aw_message(GBS_global_string("Internal error: unknown file type of file %s", filen));
238  break;
239  }
240  fclose(file);
241  }
242 }
243 
244 void AppendNA(NA_Base *buffer, int len, NA_Sequence *seq) {
245  int curlen=0, j;
246  if (seq->seqlen+len >= seq->seqmaxlen) {
247  seq->seqmaxlen = seq->seqlen + len + GBUFSIZ;
248  ARB_realloc(seq->sequence, seq->seqmaxlen);
249  }
250  // seqlen is the length, and the index of the next free base
251  curlen = seq->seqlen + seq->offset;
252  for (j=0; j<len; j++)
253  putelem(seq, j+curlen, buffer[j]);
254 
255  seq->seqlen += len;
256  return;
257 }
258 
259 void Ascii2NA(char *buffer, int len, int matrix[16]) {
260  // if the translation matrix exists, use it to encode the buffer.
261  if (matrix) {
262  for (int i=0; i<len; i++) {
263  buffer[i] = matrix[(unsigned char)buffer[i]];
264  }
265  }
266 }
267 
268 int WriteNA_Flat(NA_Alignment& aln, char *filename, int method) {
269  if (aln.numelements == 0) return 1;
270 
271  size_t j;
272  int kk;
273  int k, offset;
274  char offset_str[100], buf[100];
275  FILE *file;
276 
277  NA_Sequence *seqs = aln.element;
278 
279  file = fopen(filename, "w");
280  if (!file) {
281  Warning("Cannot open file for output");
282  return 1;
283  }
284 
285  for (j=0; j<aln.numelements; j++) {
286  offset = seqs[j].offset;
287 
288  if (offset+aln.rel_offset != 0)
289  sprintf(offset_str, "(%d)", offset+aln.rel_offset);
290  else
291  offset_str[0] = '\0';
292 
293  if (method == ALL) {
294  fprintf(file, "%c%s%s\n",
295  seqs[j].elementtype == DNA ? '#' :
296  seqs[j].elementtype == RNA ? '#' :
297  seqs[j].elementtype == PROTEIN ? '%' :
298  seqs[j].elementtype == TEXT ? '"' :
299  seqs[j].elementtype == MASK ? '@' : '"',
300  seqs[j].short_name,
301  (offset+aln.rel_offset == 0) ? "" : offset_str);
302 
303  if (seqs[j].tmatrix) {
304  for (k=0, kk=0; kk<seqs[j].seqlen; kk++) {
305  if ((k)%60 == 0 && k>0) {
306  buf[60] = '\0';
307  fputs(buf, file);
308  putc('\n', file);
309  }
310  buf[k%60] = ((char)seqs[j].tmatrix[(int)getelem(&(seqs[j]), kk+offset)]);
311  k++;
312  }
313  }
314  else {
315  for (k=0, kk=0; kk<seqs[j].seqlen; kk++) {
316  if ((k)%60 == 0 && k>0) {
317  buf[60] = '\0';
318  fputs(buf, file);
319  putc('\n', file);
320  }
321  buf[k%60] = (getelem(&(seqs[j]), kk+offset));
322  k++;
323  }
324  }
325  buf[(k%60)>0 ? (k%60) : 60] = '\0';
326  fputs(buf, file);
327  putc('\n', file);
328  }
329  }
330  fclose(file);
331  return 0;
332 }
333 
334 void Warning(const char *s) {
335  aw_message(s);
336 }
337 
338 
340  SetTime(&(seq->t_stamp.origin));
341  SetTime(&(seq->t_stamp.modify));
343  seq->seq_name[0] = '\0';
344  seq->barcode[0] = '\0';
345  seq->contig[0] = '\0';
346  seq->membrane[0] = '\0';
347  seq->authority[0] = '\0';
348  seq->short_name[0] = '\0';
349  seq->sequence = NULp;
350  seq->offset = 0;
351  seq->baggage = NULp;
352  seq->baggage_len = 0;
353  seq->baggage_maxlen = 0;
354  seq->comments = NULp;
355  seq->comments_len = 0;
356  seq->comments_maxlen = 0;
357  seq->description[0] = '\0';
358  seq->seqlen = 0;
359  seq->seqmaxlen = 0;
360  seq->attr = IS_5_TO_3 + IS_PRIMARY;
361  seq->elementtype = type;
362  seq->groupid = 0;
363  seq->groupb = NULp;
364  seq->groupf = NULp;
365 
366  switch (type) {
367  case DNA:
368  seq->tmatrix = Default_DNA_Trans;
369  seq->rmatrix = Default_NA_RTrans;
371  break;
372  case RNA:
373  seq->tmatrix = Default_RNA_Trans;
374  seq->rmatrix = Default_NA_RTrans;
376  break;
377  case PROTEIN:
378  seq->tmatrix = NULp;
379  seq->rmatrix = NULp;
381  break;
382  case MASK:
383  case TEXT:
384  default:
385  seq->tmatrix = NULp;
386  seq->rmatrix = NULp;
387  seq->col_lut = NULp;
388  break;
389  }
390  return;
391 }
392 
int DataType
Definition: GDE_global.cxx:11
#define MASK
Definition: GDE_def.h:44
int Default_NAColor_LKUP[]
Definition: GDE_global.cxx:58
const char * GB_ERROR
Definition: arb_core.h:25
void LoadData(char *filen, NA_Alignment &dataset)
Definition: GDE_FileIO.cxx:211
char short_name[SIZE_SHORT_NAME]
Definition: GDE_def.h:86
string result
GB_TYPES type
int elementtype
Definition: GDE_def.h:107
static void ReadNA_Flat(char *filename, NA_Alignment &data)
Definition: GDE_FileIO.cxx:52
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
int seqmaxlen
Definition: GDE_def.h:99
#define NASEQ_ALIGN
Definition: GDE_def.h:38
#define IS_5_TO_3
Definition: GDE_def.h:48
char seq_name[SIZE_SEQ_NAME]
Definition: GDE_def.h:85
#define DNA
Definition: GDE_def.h:41
int baggage_len
Definition: GDE_def.h:109
#define PROTEIN
Definition: GDE_def.h:45
TimeStamp t_stamp
Definition: GDE_def.h:96
int seqlen
Definition: GDE_def.h:98
int format
Definition: GDE_def.h:127
#define RNA
Definition: GDE_def.h:42
static void Regroup(NA_Alignment &alignment)
Definition: GDE_FileIO.cxx:8
NA_Sequence * element
Definition: GDE_def.h:124
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
struct TimeStamp::@1 modify
int Default_DNA_Trans[]
Definition: GDE_global.cxx:33
int * col_lut
Definition: GDE_def.h:102
int attr
Definition: GDE_def.h:100
int Default_RNA_Trans[]
Definition: GDE_global.cxx:22
NA_Sequence * groupb
Definition: GDE_def.h:104
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
FILE * seq
Definition: rns.c:46
int * rmatrix
Definition: GDE_def.h:111
int Default_NA_RTrans[]
Definition: GDE_global.cxx:44
char contig[80]
Definition: GDE_def.h:88
#define GBUFSIZ
Definition: GDE_def.h:14
void putelem(NA_Sequence *a, int b, NA_Base c)
#define TEXT
Definition: GDE_def.h:43
NA_Base * sequence
Definition: GDE_def.h:95
void strcpy_truncate(char *dest, const char *source, size_t dest_size)
Definition: GDE_def.h:137
void AppendNA(NA_Base *buffer, int len, NA_Sequence *seq)
Definition: GDE_FileIO.cxx:244
void SetTime(void *b)
static int group[MAXN+1]
Definition: ClustalV.cxx:65
char * uniqueID()
char FileName[80]
Definition: GDE_global.cxx:13
#define GDE
Definition: GDE_def.h:29
static void error(const char *msg)
Definition: mkptypes.cxx:96
#define SIZE_ID
Definition: GDE_def.h:77
struct TimeStamp::@1 origin
static void NormalizeOffset(NA_Alignment &aln)
Definition: GDE_FileIO.cxx:31
static int FindType(char *name, int *dtype, int *ftype)
Definition: GDE_FileIO.cxx:160
#define SIZE_SHORT_NAME
Definition: GDE_def.h:79
int Find2(const char *target, const char *key)
size_t groupid
Definition: GDE_def.h:101
GB_ERROR ReadGen(char *filename, NA_Alignment &dataset)
Definition: GDE_Genbank.cxx:64
Definition: fun.h:13
char * baggage
Definition: GDE_def.h:108
void Warning(const char *s)
Definition: GDE_FileIO.cxx:334
int comments_len
Definition: GDE_def.h:93
char membrane[80]
Definition: GDE_def.h:89
bool Find(const char *target, const char *key)
fputs(TRACE_PREFIX, stderr)
int comments_maxlen
Definition: GDE_def.h:93
unsigned char NA_Base
Definition: GDE_def.h:73
void Ascii2NA(char *buffer, int len, int matrix[16])
Definition: GDE_FileIO.cxx:259
int offset
Definition: GDE_def.h:97
#define NA_FLAT
Definition: GDE_def.h:31
int * tmatrix
Definition: GDE_def.h:110
static GB_ERROR LoadFile(char *filename, NA_Alignment &dataset, int type, int format)
Definition: GDE_FileIO.cxx:132
int Default_PROColor_LKUP[]
Definition: GDE_global.cxx:69
char barcode[80]
Definition: GDE_def.h:87
size_t numelements
Definition: GDE_def.h:120
NA_Sequence * groupf
Definition: GDE_def.h:105
void ARB_realloc(TYPE *&tgt, size_t nelem)
Definition: arb_mem.h:43
int getelem(NA_Sequence *a, int b)
int maxlen
Definition: GDE_def.h:122
void aw_message(const char *msg)
Definition: AW_status.cxx:1142
#define NULp
Definition: cxxforward.h:116
int rel_offset
Definition: GDE_def.h:123
#define offset(field)
Definition: GLwDrawA.c:73
char description[SIZE_DESCRIPTION]
Definition: GDE_def.h:90
int baggage_maxlen
Definition: GDE_def.h:109
#define min(a, b)
Definition: f2c.h:153
char authority[SIZE_AUTHORITY]
Definition: GDE_def.h:91
int FileFormat
Definition: GDE_global.cxx:12
char id[SIZE_ID]
Definition: GDE_def.h:84
size_t numgroups
Definition: GDE_def.h:125
#define gde_assert(bed)
Definition: GDE_def.h:12
#define IS_PRIMARY
Definition: GDE_def.h:51
void InitNASeq(NA_Sequence *seq, int type)
Definition: GDE_FileIO.cxx:339
int Arbdb_get_curelem(NA_Alignment &dataset)
char * comments
Definition: GDE_def.h:92
int WriteNA_Flat(NA_Alignment &aln, char *filename, int method)
Definition: GDE_FileIO.cxx:268
GB_write_int const char s
Definition: AW_awar.cxx:154
#define max(a, b)
Definition: f2c.h:154