ARB
GDE_Genbank.cxx
Go to the documentation of this file.
1 #include "GDE_extglob.h"
2 #include <ctime>
3 #include <algorithm>
4 
5 /*
6  Copyright (c) 1989-1990, University of Illinois board of trustees. All
7  rights reserved. Written by Steven Smith at the Center for Prokaryote Genome
8  Analysis. Design and implementation guidance by Dr. Gary Olsen and Dr.
9  Carl Woese.
10 
11  Copyright (c) 1990,1991,1992 Steven Smith at the Harvard Genome Laboratory.
12  all rights reserved.
13 
14  Copyright (c) 1993, Steven Smith, all rights reserved.
15 
16 */
17 
18 static bool CheckType(char *seq, int len) {
19  /* CheckType: Check base composition to see if the sequence
20  * appears to be an amino acid sequence.
21  */
22  int j, count1 = 0, count2 = 0;
23 
24  for (j=0; j<len; j++) {
25  if (((seq[j]|32) < 'z') && ((seq[j]|32) > 'a')) {
26  count1++;
27  if (!strchr("ACGTUNacgtun", seq[j]))
28  count2++;
29  }
30  }
31 
32  return (count2 > count1/4) ? true : false;
33 }
34 
35 // ARB
36 struct ARB_TIME {
37  int yy;
38  int mm;
39  int dd;
40  int hr;
41  int mn;
42  int sc;
43 };
44 
45 static void AsciiTime(void *b, char *asciitime) {
46  ARB_TIME *a=(ARB_TIME*)b;
47  int j;
48  char temp[GBUFSIZ];
49 
50  a->dd = 0;
51  a->yy = 0;
52  a->mm = 0;
53  sscanf(asciitime, "%d%5c%d", &(a->dd), temp, &(a->yy));
54  temp[5] = '\0';
55  for (j=0; j<12; j++)
56  if (strcmp(temp, GDEmonth[j]) == 0)
57  a->mm = j+1;
58  if (a->dd <0 || a->dd > 31 || a->yy < 0 || a->mm > 11)
59  SetTime(a);
60  return;
61 }
62 // ENDARB
63 
64 GB_ERROR ReadGen(char *filename, NA_Alignment& dataset) {
66  FILE *file = fopen(filename, "r");
67  if (!file) {
68  error = GB_IO_error("reading", filename);
69  }
70  else {
71  bool done = false;
72  size_t len = 0;
73  bool IS_REALLY_AA = false;
74  char in_line[GBUFSIZ];
75  char c;
76  char *buffer = NULp;
77  char *gencomments = NULp;
78  char fields[8][GBUFSIZ];
79  size_t buflen = 0;
80  int genclen = 0;
81  int curelem = 0;
82  int n = 0;
83  int start_col = -1;
84 
85  NA_Sequence *this_elem = NULp;
86 
87  for (; fgets(in_line, GBUFSIZ, file);) {
88  if (in_line[strlen(in_line)-1] == '\n') {
89  in_line[strlen(in_line)-1] = '\0';
90  }
91  if (Find(in_line, "LOCUS")) {
92  curelem = Arbdb_get_curelem(dataset);
93  this_elem = &(dataset.element[curelem]);
94  n = sscanf(in_line, "%s %s %s %s %s %s %s %s",
95  fields[0], fields[1], fields[2], fields[3], fields[4],
96  fields[5], fields[6], fields[7]);
97 
98  if (IS_REALLY_AA) {
99  InitNASeq(this_elem, PROTEIN);
100  }
101  else if (Find(in_line, "DNA")) {
102  InitNASeq(this_elem, DNA);
103  }
104  else if (Find(in_line, "RNA")) {
105  InitNASeq(this_elem, RNA);
106  }
107  else if (Find(in_line, "MASK")) {
108  InitNASeq(this_elem, MASK);
109  }
110  else if (Find(in_line, "TEXT")) {
111  InitNASeq(this_elem, TEXT);
112  }
113  else if (Find(in_line, "PROT")) {
114  InitNASeq(this_elem, PROTEIN);
115  }
116  else {
117  InitNASeq(this_elem, DNA);
118  }
119 
120  strcpy_truncate(this_elem->short_name, fields[1], SIZE_SHORT_NAME);
121  AsciiTime(&(this_elem->t_stamp.origin), fields[n-1]);
122  this_elem->attr = DEFAULT_X_ATTR;
123 
124  if (Find(in_line, "Circular")) {
125  this_elem->attr |= IS_CIRCULAR;
126  }
127  gencomments = NULp;
128  genclen = 0;
129  }
130  else if (Find(in_line, "DEFINITION")) {
131  strcpy_truncate(this_elem->description, in_line+12, SIZE_DESCRIPTION);
132  }
133  else if (Find(in_line, "AUTHOR")) {
134  strcpy_truncate(this_elem->authority, in_line+12, SIZE_AUTHORITY);
135  }
136  else if (Find(in_line, " ORGANISM")) {
137  strcpy_truncate(this_elem->seq_name, in_line+12, SIZE_SEQ_NAME);
138  }
139  else if (Find(in_line, "ACCESSION")) {
140  strcpy_truncate(this_elem->id, in_line+12, SIZE_ID);
141  }
142  else if (Find(in_line, "ORIGIN")) {
143  done = false;
144  len = 0;
145  for (; !done && fgets(in_line, GBUFSIZ, file);) {
146  if (in_line[0] != '/') {
147  if (buflen == 0) {
148  buflen = GBUFSIZ;
149  ARB_calloc(buffer, buflen);
150  }
151  else if (len+strlen(in_line) >= buflen) {
152  size_t new_buflen = buflen+GBUFSIZ;
153  ARB_recalloc(buffer, buflen, new_buflen);
154  buflen = new_buflen;
155  }
156  // Search for the fist column of data (whitespace-number-whitespace)data
157  if (start_col == -1) {
158  for (start_col=0; in_line[start_col] == ' ' || in_line[start_col] == '\t'; start_col++) ;
159  for (start_col++; strchr("1234567890", in_line[start_col]); start_col++) ;
160  for (start_col++; in_line[start_col] == ' ' || in_line[start_col] == '\t'; start_col++) ;
161  }
162  for (int j=start_col; (c = in_line[j]) != '\0'; j++) {
163  if ((c != '\n') && ((j-start_col + 1) % 11 != 0)) {
164  buffer[len++] = c;
165  }
166  }
167  }
168  else {
169  AppendNA((NA_Base*)buffer, len, &(dataset.element[curelem]));
170  for (size_t j=0; j<len; j++) buffer[j] = '\0';
171  len = 0;
172  done = true;
173  dataset.element[curelem].comments = gencomments;
174  dataset.element[curelem].comments_len= genclen - 1;
175  dataset.element[curelem].comments_maxlen = genclen;
176 
177  gencomments = NULp;
178  genclen = 0;
179  }
180  }
181  /* Test if sequence should be converted by the translation table
182  * If it looks like a protein...
183  */
184  if (dataset.element[curelem].rmatrix && !IS_REALLY_AA) {
185  IS_REALLY_AA = CheckType((char*)dataset.element[curelem]. sequence, dataset.element[curelem].seqlen);
186 
187  if (!IS_REALLY_AA)
188  Ascii2NA((char*)dataset.element[curelem].sequence,
189  dataset.element[curelem].seqlen,
190  dataset.element[curelem].rmatrix);
191  else {
192  // Force the sequence to be AA
193  dataset.element[curelem].elementtype = PROTEIN;
194  dataset.element[curelem].rmatrix = NULp;
195  dataset.element[curelem].tmatrix = NULp;
196  dataset.element[curelem].col_lut = Default_PROColor_LKUP;
197  }
198  }
199  }
200  else if (Find(in_line, "ZZZZZ")) {
201  free(gencomments);
202  genclen = 0;
203  }
204  else {
205  if (!gencomments) {
206  gencomments = ARB_strdup(in_line);
207  genclen = strlen(gencomments)+1;
208  }
209  else {
210  genclen += strlen(in_line)+1;
211  ARB_realloc(gencomments, genclen);
212  strncat(gencomments, in_line, GBUFSIZ);
213  strncat(gencomments, "\n", GBUFSIZ);
214  }
215  }
216  }
217  free(buffer);
218  fclose(file);
219  }
220  for (size_t j=0; j<dataset.numelements; j++) {
221  dataset.maxlen = std::max(dataset.maxlen,
222  dataset.element[j].seqlen+dataset.element[j].offset);
223  }
224  return error;
225 }
226 
227 int WriteGen(NA_Alignment& aln, char *filename, int method) {
228  int i;
229  size_t j;
230  int k;
231  NA_Sequence *this_elem;
232 
233  FILE *file = fopen(filename, "w");
234  if (!file) {
235  Warning("Cannot open file for output");
236  return 1;
237  }
238 
239  for (j=0; j<aln.numelements; j++) {
240  this_elem = &(aln.element[j]);
241  if (method == ALL) {
242  fprintf(file,
243  "LOCUS %10s%8d bp %4s %10s %2d%5s%4d\n",
244  this_elem->short_name, this_elem->seqlen+this_elem->offset,
245  (this_elem->elementtype == DNA) ? "DNA" :
246  (this_elem->elementtype == RNA) ? "RNA" :
247  (this_elem->elementtype == MASK) ? "MASK" :
248  (this_elem->elementtype == PROTEIN) ? "PROT" : "TEXT",
249  this_elem->attr & IS_CIRCULAR ? "Circular" : "",
250  this_elem->t_stamp.origin.dd,
251  GDEmonth[this_elem->t_stamp.origin.mm-1],
252  (this_elem->t_stamp.origin.yy>1900) ? this_elem->t_stamp.origin.yy :
253  this_elem->t_stamp.origin.yy+1900);
254 
255  if (this_elem->description[0]) fprintf(file, "DEFINITION %s\n", this_elem->description);
256  if (this_elem->seq_name[0]) fprintf(file, " ORGANISM %s\n", this_elem->seq_name);
257  if (this_elem->id[0]) fprintf(file, " ACCESSION %s\n", this_elem->id);
258  if (this_elem->authority[0]) fprintf(file, " AUTHORS %s\n", this_elem->authority);
259  if (this_elem->comments) fprintf(file, "%s\n", this_elem->comments);
260 
261  fprintf(file, "ORIGIN");
262 
263  if (this_elem->tmatrix) {
264  for (i=0, k=0; k<this_elem->seqlen+this_elem->offset; k++) {
265  if (i%60 == 0) fprintf(file, "\n%9d", i+1);
266  if (i%10 == 0) fprintf(file, " ");
267  fprintf(file, "%c", this_elem->tmatrix[getelem(this_elem, k)]);
268  i++;
269  }
270  }
271  else {
272  for (i=0, k=0; k<this_elem->seqlen+this_elem->offset; k++) {
273  if (i%60 == 0) fprintf(file, "\n%9d", i+1);
274  if (i%10 == 0) fprintf(file, " ");
275  fprintf(file, "%c", getelem(this_elem, k));
276  i++;
277  }
278  }
279  fprintf(file, "\n//\n");
280  }
281  }
282  fclose(file);
283  return 0;
284 }
285 
286 
287 void SetTime(void *b) {
288  ARB_TIME *a=(ARB_TIME*)b;
289  struct tm *tim;
290  time_t clock;
291 
292  clock = time(NULp);
293  tim = localtime(&clock);
294 
295  a->yy = tim->tm_year;
296  a->mm = tim->tm_mon+1;
297  a->dd = tim->tm_mday;
298  a->hr = tim->tm_hour;
299  a->mn = tim->tm_min;
300  a->sc = tim->tm_sec;
301  return;
302 }
303 
#define MASK
Definition: GDE_def.h:44
const char * GB_ERROR
Definition: arb_core.h:25
char short_name[SIZE_SHORT_NAME]
Definition: GDE_def.h:86
int elementtype
Definition: GDE_def.h:107
char seq_name[SIZE_SEQ_NAME]
Definition: GDE_def.h:85
#define DNA
Definition: GDE_def.h:41
#define PROTEIN
Definition: GDE_def.h:45
TimeStamp t_stamp
Definition: GDE_def.h:96
#define SIZE_AUTHORITY
Definition: GDE_def.h:81
int seqlen
Definition: GDE_def.h:98
#define DEFAULT_X_ATTR
Definition: GDE_def.h:58
GB_ERROR GB_IO_error(const char *action, const char *filename)
Definition: arb_msg.cxx:285
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
#define RNA
Definition: GDE_def.h:42
NA_Sequence * element
Definition: GDE_def.h:124
int * col_lut
Definition: GDE_def.h:102
int attr
Definition: GDE_def.h:100
int yy
Definition: GDE_def.h:64
char buffer[MESSAGE_BUFFERSIZE]
Definition: seq_search.cxx:34
FILE * seq
Definition: rns.c:46
int * rmatrix
Definition: GDE_def.h:111
#define GBUFSIZ
Definition: GDE_def.h:14
#define TEXT
Definition: GDE_def.h:43
NA_Base * sequence
Definition: GDE_def.h:95
void strcpy_truncate(char *dest, const char *source, size_t dest_size)
Definition: GDE_def.h:137
int dd
Definition: GDE_def.h:66
void AppendNA(NA_Base *buffer, int len, NA_Sequence *seq)
Definition: GDE_FileIO.cxx:244
void SetTime(void *b)
static void AsciiTime(void *b, char *asciitime)
Definition: GDE_Genbank.cxx:45
static void error(const char *msg)
Definition: mkptypes.cxx:96
#define SIZE_ID
Definition: GDE_def.h:77
#define IS_CIRCULAR
Definition: GDE_def.h:50
struct TimeStamp::@1 origin
void ARB_recalloc(TYPE *&tgt, size_t oelem, size_t nelem)
Definition: arb_mem.h:49
#define SIZE_SHORT_NAME
Definition: GDE_def.h:79
GB_ERROR ReadGen(char *filename, NA_Alignment &dataset)
Definition: GDE_Genbank.cxx:64
static bool CheckType(char *seq, int len)
Definition: GDE_Genbank.cxx:18
#define SIZE_SEQ_NAME
Definition: GDE_def.h:78
void Warning(const char *s)
Definition: GDE_FileIO.cxx:334
int comments_len
Definition: GDE_def.h:93
bool Find(const char *target, const char *key)
int comments_maxlen
Definition: GDE_def.h:93
unsigned char NA_Base
Definition: GDE_def.h:73
int mm
Definition: GDE_def.h:65
void Ascii2NA(char *buffer, int len, int matrix[16])
Definition: GDE_FileIO.cxx:259
int WriteGen(NA_Alignment &aln, char *filename, int method)
int offset
Definition: GDE_def.h:97
TYPE * ARB_calloc(size_t nelem)
Definition: arb_mem.h:81
int * tmatrix
Definition: GDE_def.h:110
int Default_PROColor_LKUP[]
Definition: GDE_global.cxx:69
size_t numelements
Definition: GDE_def.h:120
void ARB_realloc(TYPE *&tgt, size_t nelem)
Definition: arb_mem.h:43
int getelem(NA_Sequence *a, int b)
const char * GDEmonth[12]
Definition: GDE_global.cxx:16
int maxlen
Definition: GDE_def.h:122
#define SIZE_DESCRIPTION
Definition: GDE_def.h:80
#define NULp
Definition: cxxforward.h:116
char description[SIZE_DESCRIPTION]
Definition: GDE_def.h:90
char authority[SIZE_AUTHORITY]
Definition: GDE_def.h:91
char id[SIZE_ID]
Definition: GDE_def.h:84
void InitNASeq(NA_Sequence *seq, int type)
Definition: GDE_FileIO.cxx:339
int Arbdb_get_curelem(NA_Alignment &dataset)
char * comments
Definition: GDE_def.h:92
#define max(a, b)
Definition: f2c.h:154