ARB
ureadseq.c
Go to the documentation of this file.
1 /* File: ureadseq.c
2  *
3  * Reads and writes nucleic/protein sequence in various
4  * formats. Data files may have multiple sequences.
5  *
6  * Copyright 1990 by d.g.gilbert
7  * biology dept., indiana university, bloomington, in 47405
8  * e-mail: gilbertd@bio.indiana.edu
9  *
10  * This program may be freely copied and used by anyone.
11  * Developers are encourged to incorporate parts in their
12  * programs, rather than devise their own private sequence
13  * format.
14  *
15  * This should compile and run with any ANSI C compiler.
16  *
17  * ------------------------------------------------------------
18  *
19  * Some slight modifications were done by ARB developers
20  *
21  */
22 
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #define __NO_CTYPE
28 #include <ctype.h>
29 
30 #include "ureadseq.h"
32 
33 
34 int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */
35 {
36  int diff, done;
37  if (a == b) return 0;
38  done = 0;
39  while (! done) {
40  diff = to_upper(*a) - to_upper(*b);
41  if (diff) return diff;
42  if (*a == '\0') done = 1;
43  else { a++; b++; }
44  }
45  return 0;
46 }
47 
48 int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */
49 {
50  int diff, done;
51  if (a == b) return 0;
52  done = 0;
53  while (! done) {
54  diff = to_upper(*a) - to_upper(*b);
55  if (diff) return diff;
56  if (*a == '\0') done = 1;
57  else {
58  a++; b++; maxn--;
59  if (! maxn) done = 1;
60  }
61  }
62  return 0;
63 }
64 
65 
66 
67 
68 
69 #ifndef Local
70 # define Local static /* local functions */
71 #endif
72 
73 #define kStartLength 500000 /* 20Apr93 temp. bug fix */
74 
75 const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
76 const char *primenuc = "ACGTU";
77 const char *protonly = "EFIPQZ";
78 
79 const char kNocountsymbols[5] = "_.-?";
80 const char stdsymbols[6] = "_.-*?";
81 const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\";
82 static const char *seqsymbols = allsymbols;
83 
84 const char nummask[11] = "0123456789";
85 const char nonummask[11] = "~!@#$%^&*(";
86 
87 /*
88  use general form of isseqchar -- all chars + symbols.
89  no formats except nbrf (?) use symbols in data area as
90  anything other than sequence chars.
91 */
92 
93 
94 
95  /* Local variables for readSeq: */
96 struct ReadSeqVars {
97  short choice, err, nseq;
99  short topnseq;
100  long topseqlen;
101  const char *fname;
102  char *seq, *seqid, matchchar;
104  FILE *f;
105  long linestart;
106  char s[256], *sp;
107 
108  int (*isseqcharfirst8)(); /* Patch by o. strunk (ARB) to allow numbers in genbank sequences*/
110  /* int (*isseqchar)(int c); << sgi cc hates (int c) */
111 };
112 
113 
114 
115 int isSeqChar(int c)
116 {
117  return (isalpha(c) || strchr(seqsymbols,c));
118 }
119 
120 int isSeqNumChar(int c)
121 {
122  return (isalnum(c) || strchr(seqsymbols,c));
123 }
124 
125 
126 int isAnyChar(int c)
127 {
128  return isascii(c); /* wrap in case isascii is macro */
129 }
130 
131 Local void readline(FILE *f, char *s, long *linestart)
132 {
133  char *cp;
134 
135  *linestart= ftell(f);
136  if (NULL == fgets(s, 256, f))
137  *s = 0;
138  else {
139  cp = strchr(s, '\n');
140  if (cp != NULL) *cp = 0;
141  }
142 }
143 
144 Local void GetLine(struct ReadSeqVars *V)
145 {
146  readline(V->f, V->s, &V->linestart);
147 }
148 
149 Local void unGetLine(struct ReadSeqVars *V)
150 {
151  fseek(V->f, V->linestart, 0);
152 }
153 
154 
155 Local void addseq(char *s, struct ReadSeqVars * V)
156 {
157  char *ptr;
158  int count = 0;
159  if (V->addit) {
160  for (;*s != 0;s++,count++) {
161  if (count < 9 && V->isseqcharfirst8) {
162  if (!(V->isseqcharfirst8) (*s)) continue;
163  }
164  else {
165  if (!(V->isseqchar) (*s)) continue;
166  }
167  if (V->seqlen >= V->maxseq) {
168  V->maxseq += kStartLength;
169  ptr = (char *) realloc(V->seq, V->maxseq + 1);
170  if (ptr == NULL) {
171  V->err = eMemFull;
172  return;
173  } else
174  V->seq = ptr;
175  }
176  V->seq[(V->seqlen)++] = *s;
177  }
178  }
179 }
180 
181 Local void countseq(char *s, struct ReadSeqVars *V)
182  /* this must count all valid seq chars, for some formats (paup-sequential) even
183  if we are skipping seq... */
184 {
185  while (*s != 0) {
186  if ((V->isseqchar)(*s)) {
187  (V->seqlencount)++;
188  }
189  s++;
190  }
191 }
192 
193 
194 Local void addinfo(char *s, struct ReadSeqVars *V)
195 {
196  char s2[256], *si;
197  boolean saveadd;
198 
199  si = s2;
200  while (*s == ' ') s++;
201  sprintf(si, " %d) %s\n", V->nseq, s);
202 
203  saveadd = V->addit;
204  V->addit = true;
205  V->isseqchar = isAnyChar;
206  addseq( si, V);
207  V->addit = saveadd;
208  V->isseqchar = isSeqChar;
209 }
210 
211 
212 
213 
214 Local void readLoop(short margin, boolean addfirst,
215  boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V),
216  struct ReadSeqVars *V)
217 {
218  boolean addend = false;
219  boolean ungetend = false;
220 
221  V->nseq++;
222  if (V->choice == kListSequences) V->addit = false;
223  else V->addit = (V->nseq == V->choice);
224  if (V->addit) V->seqlen = 0;
225 
226  if (addfirst) addseq(V->s, V);
227  do {
228  GetLine(V);
229  V->done = feof(V->f);
230  V->done |= (*endTest)( &addend, &ungetend, V);
231  if (V->addit && (addend || !V->done) && (strlen(V->s) > (unsigned)margin)) {
232  addseq( (V->s)+margin, V);
233  }
234  } while (!V->done);
235 
236  if (V->choice == kListSequences) addinfo(V->seqid, V);
237  else {
238  V->allDone = (V->nseq >= V->choice);
239  if (V->allDone && ungetend) unGetLine(V);
240  }
241 }
242 
243 
244 
245 Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
246 {
247  *addend = true; /* 1 or 2 occur in line w/ bases */
248  *ungetend= false;
249  return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL));
250 }
251 
252 Local void readIG(struct ReadSeqVars *V)
253 {
254 /* 18Aug92: new IG format -- ^L between sequences in place of ";" */
255  char *si;
256 
257  while (!V->allDone) {
258  do {
259  GetLine(V);
260  for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */
261  if (*si == 0) *V->s= 0; /* chop line to empty */
262  } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) ));
263  if (feof(V->f))
264  V->allDone = true;
265  else {
266  strcpy(V->seqid, V->s);
267  readLoop(0, false, endIG, V);
268  }
269  }
270 }
271 
272 
273 
274 Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
275 {
276  *addend = false;
277  *ungetend= false;
278  return (strstr( V->s, "//") != NULL);
279 }
280 
282 { /* ? only 1 seq/file ? */
283 
284  while (!V->allDone) {
285  GetLine(V);
286  if (strstr(V->s,"; DNA sequence ") == V->s)
287  strcpy(V->seqid, (V->s)+16);
288  else
289  strcpy(V->seqid, (V->s)+1);
290  while ((!feof(V->f)) && (*V->s == ';')) {
291  GetLine(V);
292  }
293  if (feof(V->f)) V->allDone = true;
294  else readLoop(0, true, endStrider, V);
295  }
296 }
297 
298 
299 Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
300 {
301  *addend = false;
302  *ungetend= (strstr(V->s,"ENTRY") == V->s);
303  return ((strstr(V->s,"///") != NULL) || *ungetend);
304 }
305 
306 Local void readPIR(struct ReadSeqVars *V)
307 { /*PIR -- many seqs/file */
308 
309  while (!V->allDone) {
310  while (! (feof(V->f) || strstr(V->s,"ENTRY") || strstr(V->s,"SEQUENCE")) )
311  GetLine(V);
312  strcpy(V->seqid, (V->s)+16);
313  while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s))
314  GetLine(V);
315  readLoop(0, false, endPIR, V);
316 
317  if (!V->allDone) {
318  while (! (feof(V->f) || ((*V->s != 0)
319  && (strstr( V->s,"ENTRY") == V->s))))
320  GetLine(V);
321  }
322  if (feof(V->f)) V->allDone = true;
323  }
324 }
325 
326 
327 Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
328 {
329  *addend = false;
330  *ungetend= (strstr(V->s,"LOCUS") == V->s);
331  return ((strstr(V->s,"//") != NULL) || *ungetend);
332 }
333 
334 Local void readGenBank(struct ReadSeqVars * V)
335 { /* GenBank -- many seqs/file */
336 
337  V->isseqchar = isSeqNumChar;
339 
340  while (!V->allDone) {
341  strcpy(V->seqid, (V->s)+12);
342  while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s))
343  GetLine(V);
344  readLoop(0, false, endGB, V);
345 
346  if (!V->allDone) {
347  while (! (feof(V->f) || ((*V->s != 0)
348  && (strstr( V->s, "LOCUS") == V->s))))
349  GetLine(V);
350  }
351  if (feof(V->f)) V->allDone = true;
352  }
353 
354  V->isseqchar = isSeqChar;
355  V->isseqcharfirst8 = 0;
356 }
357 
358 
359 Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
360 {
361  char *a;
362 
363  if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */
364  /* "*" can be valid base symbol, drop it here */
365  *a = 0;
366  *addend = true;
367  *ungetend= false;
368  return(true);
369  }
370  else if (*V->s == '>') { /* start of next seq */
371  *addend = false;
372  *ungetend= true;
373  return(true);
374  }
375  else
376  return(false);
377 }
378 
379 Local void readNBRF(struct ReadSeqVars *V)
380 {
381  while (!V->allDone) {
382  strcpy(V->seqid, (V->s)+4);
383  GetLine(V); /*skip title-junk line*/
384  readLoop(0, false, endNBRF, V);
385  if (!V->allDone) {
386  while (!(feof(V->f) || (*V->s != 0 && *V->s == '>')))
387  GetLine(V);
388  }
389  if (feof(V->f)) V->allDone = true;
390  }
391 }
392 
393 
394 
395 Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
396 {
397  *addend = false;
398  *ungetend= true;
399  return(*V->s == '>');
400 }
401 
403 {
404  while (!V->allDone) {
405  strcpy(V->seqid, (V->s)+1);
406  readLoop(0, false, endPearson, V);
407  if (!V->allDone) {
408  while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>'))))
409  GetLine(V);
410  }
411  if (feof(V->f)) V->allDone = true;
412  }
413 }
414 
415 
416 
417 Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
418 {
419  *addend = false;
420  *ungetend= (strstr(V->s,"ID ") == V->s);
421  return ((strstr(V->s,"//") != NULL) || *ungetend);
422 }
423 
424 Local void readEMBL(struct ReadSeqVars *V)
425 {
426  while (!V->allDone) {
427  strcpy(V->seqid, (V->s)+5);
428  do {
429  GetLine(V);
430  } while (!(feof(V->f) | (strstr(V->s,"SQ ") == V->s)));
431 
432  readLoop(0, false, endEMBL, V);
433  if (!V->allDone) {
434  while (!(feof(V->f) |
435  ((*V->s != '\0') & (strstr(V->s,"ID ") == V->s))))
436  GetLine(V);
437  }
438  if (feof(V->f)) V->allDone = true;
439  }
440 }
441 
442 
443 
444 Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
445 {
446  *addend = false;
447  *ungetend= true;
448  return( *V->s == '(' );
449 }
450 
451 Local void readZuker(struct ReadSeqVars *V)
452 {
455  while (!V->allDone) {
456  GetLine(V); /*s == "seqLen seqid string..."*/
457  strcpy(V->seqid, (V->s)+6);
458  readLoop(0, false, endZuker, V);
459  if (!V->allDone) {
460  while (!(feof(V->f) |
461  ((*V->s != '\0') & (*V->s == '('))))
462  GetLine(V);
463  }
464  if (feof(V->f)) V->allDone = true;
465  }
466 }
467 
468 
469 
470 Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
471 {
472  /* this is a somewhat shaky end,
473  1st char of line is non-blank for seq. title
474  */
475  *addend = false;
476  *ungetend= true;
477  return( *V->s != ' ' );
478 }
479 
480 Local void readFitch(struct ReadSeqVars *V)
481 {
482  boolean first;
483 
484  first = true;
485  while (!V->allDone) {
486  if (!first) strcpy(V->seqid, V->s);
487  readLoop(0, first, endFitch, V);
488  if (feof(V->f)) V->allDone = true;
489  first = false;
490  }
491 }
492 
493 
494 Local void readPlain(struct ReadSeqVars *V)
495 {
496  V->nseq++;
497  V->addit = (V->choice > 0);
498  if (V->addit) V->seqlen = 0;
499  addseq(V->seqid, V); /*from above..*/
500  if (V->fname!=NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname);
501  else sprintf(V->seqid, " [Unknown form]");
502  do {
503  addseq(V->s, V);
504  V->done = feof(V->f);
505  GetLine(V);
506  } while (!V->done);
507  if (V->choice == kListSequences) addinfo(V->seqid, V);
508  V->allDone = true;
509 }
510 
511 
512 Local void readUWGCG(struct ReadSeqVars *V)
513 {
514 /*
515 10nov91: Reading GCG files casued duplication of last line when
516  EOF followed that line !!!
517  fix: GetLine now sets *V->s = 0
518 */
519  char *si;
520 
521  V->nseq++;
522  V->addit = (V->choice > 0);
523  if (V->addit) V->seqlen = 0;
524  strcpy(V->seqid, V->s);
525  /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
526  /*drop above or ".." from id*/
527  if ((si = strstr(V->seqid," Length: "))) *si = 0;
528  else if ((si = strstr(V->seqid,".."))) *si = 0;
529  do {
530  V->done = feof(V->f);
531  GetLine(V);
532  if (!V->done) addseq((V->s), V);
533  } while (!V->done);
534  if (V->choice == kListSequences) addinfo(V->seqid, V);
535  V->allDone = true;
536 }
537 
538 
539 Local void readOlsen(struct ReadSeqVars *V)
540 { /* G. Olsen /print output from multiple sequence editor */
541 
542  char *si, *sj, *sk, *sm, sid[40], snum[20];
543  boolean indata = false;
544  int snumlen;
545 
546  V->addit = (V->choice > 0);
547  if (V->addit) V->seqlen = 0;
548  rewind(V->f); V->nseq= 0;
549  do {
550  GetLine(V);
551  V->done = feof(V->f);
552 
553  if (V->done && !(*V->s)) break;
554  else if (indata) {
555  if ( (si= strstr(V->s, sid))
556  /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */
557  && (sm= strstr(V->s, snum)) && (sm < si - snumlen) ) {
558 
559  /* Spaces are valid alignment data !! */
560 /* 17Oct91: Error, the left margin is 21 not 22! */
561 /* dropped some nucs up to now -- my example file was right shifted ! */
562 /* variable right id margin, drop id-2 spaces at end */
563 /*
564  VMS CC COMPILER (VAXC031) mess up:
565  -- Index of 21 is chopping 1st nuc on VMS systems Only!
566  Byte-for-byte same ame rnasep.olsen sequence file !
567 */
568 
569  /* si = (V->s)+21; < was this before VMS CC wasted my time */
570  si += 10; /* use strstr index plus offset to outfox VMS CC bug */
571 
572  if ((sk = strstr(si, sid))) *(sk-2) = 0;
573  for (sk = si; *sk != 0; sk++) {
574  if (*sk == ' ') *sk = '.';
575  /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */
576  else if (isdigit(*sk)) *sk= nonummask[*sk - '0'];
577  }
578 
579  addseq(si, V);
580  }
581  }
582 
583  else if ((sk = strstr(V->s, "): "))) { /* seq info header line */
584  /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */
585  /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 16:12 */
586  /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */
587  (V->nseq)++;
588  si = 1 + strchr(V->s,'(');
589  *sk = ' ';
590  if (V->choice == kListSequences) addinfo( si, V);
591  else if (V->nseq == V->choice) {
592  strcpy(V->seqid, si);
593  sj = strchr(V->seqid, ':');
594  while (*(--sj) == ' ') ;
595  while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; }
596 
597  *sk = 0;
598  while (*(--sk) == ' ') *sk = 0;
599  strcpy(sid, si);
600 
601  si= V->s;
602  while ((*si <= ' ') && (*si != 0)) si++;
603  snumlen=0;
604  while (si[snumlen] > ' ' && snumlen<20)
605  { snum[snumlen]= si[snumlen]; snumlen++; }
606  snum[snumlen]= 0;
607  }
608 
609  }
610 
611  else if (strstr(V->s,"identity: Data:")) {
612  indata = true;
613  if (V->choice == kListSequences) V->done = true;
614  }
615 
616  } while (!V->done);
617 
618  V->allDone = true;
619 } /*readOlsen*/
620 
621 
622 Local void readMSF(struct ReadSeqVars *V)
623 { /* gcg's MSF, mult. sequence format, interleaved ! */
624 
625  char *si, *sj, sid[128];
626  boolean indata = false;
627  int iline= 0;
628 
629  V->addit = (V->choice > 0);
630  if (V->addit) V->seqlen = 0;
631  rewind(V->f); V->nseq= 0;
632  do {
633  GetLine(V);
634  V->done = feof(V->f);
635 
636  if (V->done && !(*V->s)) break;
637  else if (indata) {
638  /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */
639  /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */
640 
641  si= V->s;
642  skipwhitespace(si);
643  /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/
644  for (sj= si; *sj > ' '; sj++) ;
645  *sj= 0;
646  if ( *si ) {
647  if ( (0==strcmp(si, sid)) ) {
648  addseq(sj+1, V);
649  }
650  iline++;
651  }
652  }
653 
654  else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */
655  /* Name: somename Len: 100 Check: 7009 Weight: 1.00 */
656 
657  (V->nseq)++;
658  si += 6;
659  if (V->choice == kListSequences) addinfo( si, V);
660  else if (V->nseq == V->choice) {
661  strcpy(V->seqid, si);
662  si = V->seqid;
663  skipwhitespace(si);
664  /* for (sj= si; isalnum(*sj); sj++) ; -- bug */
665  for (sj= si; *sj > ' '; sj++) ;
666  *sj= 0;
667  strcpy(sid, si);
668  }
669  }
670 
671  else if ( strstr(V->s,"//") /*== V->s*/ ) {
672  indata = true;
673  iline= 0;
674  if (V->choice == kListSequences) V->done = true;
675  }
676 
677  } while (!V->done);
678 
679 
680  V->allDone = true;
681 } /*readMSF*/
682 
683 
684 
686 { /* PAUP mult. sequence format, interleaved or sequential! */
687 
688  char *si, *sj, *send, sid[40], sid1[40], saveseq[255];
689  boolean first = true, indata = false, domatch;
690  int iline= 0, ifmc, saveseqlen=0;
691 
692 #define fixmatchchar(s) { \
693  for (ifmc=0; ifmc<saveseqlen; ifmc++) \
694  if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; }
695 
696  V->addit = (V->choice > 0);
697  V->seqlencount = 0;
698  if (V->addit) V->seqlen = 0;
699  /* rewind(V->f); V->nseq= 0; << do in caller !*/
700  indata= true; /* call here after we find "matrix" */
701  domatch= (V->matchchar > 0);
702 
703  do {
704  GetLine(V);
705  V->done = feof(V->f);
706 
707  if (V->done && !(*V->s)) break;
708  else if (indata) {
709  /* [ 1 1 1 ]*/
710  /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
711  /* chimp ................a.t. .c.................a ..........*/
712  /* !! need to correct for V->matchchar */
713  si= V->s;
714  skipwhitespace(si);
715  if (strchr(si,';')) indata= false;
716 
717  if (isalnum(*si)) {
718  /* valid data line starts w/ a left-justified seq name in columns [0..8] */
719  if (first) {
720  (V->nseq)++;
721  if (V->nseq >= V->topnseq) first= false;
722  for (sj = si; isalnum(*sj); sj++) ;
723  send= sj;
724  skipwhitespace(sj);
725  if (V->choice == kListSequences) {
726  *send= 0;
727  addinfo( si, V);
728  }
729  else if (V->nseq == V->choice) {
730  if (domatch) {
731  if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); }
732  else fixmatchchar( sj);
733  }
734  addseq(sj, V);
735  *send= 0;
736  strcpy(V->seqid, si);
737  strcpy(sid, si);
738  if (V->nseq == 1) strcpy(sid1, sid);
739  }
740  }
741 
742  else if ( (strstr(si, sid) == si) ){
743  while (isalnum(*si)) si++;
744  skipwhitespace(si);
745  if (domatch) {
746  if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); }
747  else fixmatchchar( si);
748  }
749  addseq(si, V);
750  }
751 
752  else if (domatch && (strstr(si, sid1) == si)) {
753  strcpy( saveseq, si);
754  saveseqlen= strlen(saveseq);
755  }
756 
757  iline++;
758  }
759  }
760 
761  else if ( strstr(V->s,"matrix") ) {
762  indata = true;
763  iline= 0;
764  if (V->choice == kListSequences) V->done = true;
765  }
766 
767  } while (!V->done);
768 
769  V->allDone = true;
770 } /*readPAUPinterleaved*/
771 
772 
773 
775 { /* PAUP mult. sequence format, interleaved or sequential! */
776  char *si, *sj;
777  boolean atname = true, indata = false;
778 
779  V->addit = (V->choice > 0);
780  if (V->addit) V->seqlen = 0;
781  V->seqlencount = 0;
782  /* rewind(V->f); V->nseq= 0; << do in caller !*/
783  indata= true; /* call here after we find "matrix" */
784  do {
785  GetLine(V);
786  V->done = feof(V->f);
787 
788  if (V->done && !(*V->s)) break;
789  else if (indata) {
790  /* [ 1 1 1 ]*/
791  /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
792  /* aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
793  /* chimp ................a.t. .c.................a ..........*/
794  /* ................a.t. .c.................a ..........*/
795 
796  si= V->s;
797  skipwhitespace(si);
798  if (strchr(si,';')) indata= false;
799  if (isalnum(*si)) {
800  /* valid data line starts w/ a left-justified seq name in columns [0..8] */
801  if (atname) {
802  (V->nseq)++;
803  V->seqlencount = 0;
804  atname= false;
805  sj= si+1;
806  while (isalnum(*sj)) sj++;
807  if (V->choice == kListSequences) {
808  /* !! we must count bases to know when topseqlen is reached ! */
809  countseq(sj, V);
810  if (V->seqlencount >= V->topseqlen) atname= true;
811  *sj= 0;
812  addinfo( si, V);
813  }
814  else if (V->nseq == V->choice) {
815  addseq(sj, V);
816  V->seqlencount= V->seqlen;
817  if (V->seqlencount >= V->topseqlen) atname= true;
818  *sj= 0;
819  strcpy(V->seqid, si);
820  }
821  else {
822  countseq(sj, V);
823  if (V->seqlencount >= V->topseqlen) atname= true;
824  }
825  }
826 
827  else if (V->nseq == V->choice) {
828  addseq(V->s, V);
829  V->seqlencount= V->seqlen;
830  if (V->seqlencount >= V->topseqlen) atname= true;
831  }
832  else {
833  countseq(V->s, V);
834  if (V->seqlencount >= V->topseqlen) atname= true;
835  }
836  }
837  }
838 
839  else if ( strstr(V->s,"matrix") ) {
840  indata = true;
841  atname= true;
842  if (V->choice == kListSequences) V->done = true;
843  }
844 
845  } while (!V->done);
846 
847  V->allDone = true;
848 } /*readPAUPsequential*/
849 
850 
852 {
853  char *si, *sj;
854  boolean first = true;
855  int iline= 0;
856 
857  V->addit = (V->choice > 0);
858  if (V->addit) V->seqlen = 0;
859  V->seqlencount = 0;
860  /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */
861  si= V->s;
862  skipwhitespace(si);
863  V->topnseq= atoi(si);
864  while (isdigit(*si)) si++;
865  skipwhitespace(si);
866  V->topseqlen= atol(si);
867  /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, V->topseqlen); */
868 
869  do {
870  GetLine(V);
871  V->done = feof(V->f);
872 
873  if (V->done && !(*V->s)) break;
874  si= V->s;
875  skipwhitespace(si);
876  if (*si != 0) {
877 
878  if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */
879  (V->nseq)++;
880  if (V->nseq >= V->topnseq) first= false;
881  sj= V->s+10; /* past name, start of data */
882  if (V->choice == kListSequences) {
883  *sj= 0;
884  addinfo( si, V);
885  }
886  else if (V->nseq == V->choice) {
887  addseq(sj, V);
888  *sj= 0;
889  strcpy(V->seqid, si);
890  }
891  }
892  else if ( iline % V->nseq == V->choice -1 ) {
893  addseq(si, V);
894  }
895  iline++;
896  }
897  } while (!V->done);
898 
899  V->allDone = true;
900 } /*readPhylipInterleaved*/
901 
902 
903 
904 Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
905 {
906  *addend = false;
907  *ungetend= false;
908  countseq( V->s, V);
909  return V->seqlencount >= V->topseqlen;
910 }
911 
913 {
914  short i;
915  char *si;
916  /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */
917  si= V->s;
918  skipwhitespace(si);
919  V->topnseq= atoi(si);
920  while (isdigit(*si)) si++;
921  skipwhitespace(si);
922  V->topseqlen= atol(si);
923  GetLine(V);
924  while (!V->allDone) {
925  V->seqlencount= 0;
926  strncpy(V->seqid, (V->s), 10);
927  V->seqid[10]= 0;
928  for (i=0; i<10 && V->s[i]; i++) V->s[i]= ' ';
929  readLoop(0, true, endPhylipSequential, V);
930  if (feof(V->f)) V->allDone = true;
931  }
932 }
933 
934 
935 
936 
938  struct ReadSeqVars *V,
939  const long skiplines_,
940  const short format_)
941 {
942 #define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \
943  for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); }
944 
945  boolean gotuw;
946  long l;
947 
948  V->linestart= 0;
949  V->matchchar= 0;
950  if (V->f == NULL)
951  V->err = eFileNotFound;
952  else {
953 
954  for (l = skiplines_; l > 0; l--) GetLine( V);
955 
956  do {
957  GetLine( V);
958  for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ;
959  } while ((l == 0) && !feof(V->f));
960 
961  if (feof(V->f)) V->err = eNoData;
962 
963  else switch (format_) {
964  case kPlain : readPlain(V); break;
965  case kIG : readIG(V); break;
966  case kStrider: readStrider(V); break;
967  case kGenBank: readGenBank(V); break;
968  case kPIR : readPIR(V); break;
969  case kNBRF : readNBRF(V); break;
970  case kPearson: readPearson(V); break;
971  case kEMBL : readEMBL(V); break;
972  case kZuker : readZuker(V); break;
973  case kOlsen : readOlsen(V); break;
974  case kMSF : readMSF(V); break;
975 
976  case kPAUP : {
977  boolean done= false;
978  boolean interleaved= false;
979  char *cp;
980  /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */
981  while (!done) {
982  GetLine( V);
983  tolowerstr( V->s);
984  if (strstr( V->s, "matrix")) done= true;
985  if (strstr( V->s, "interleav")) interleaved= true;
986  if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5);
987  if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6);
988  if (NULL != (cp=strstr( V->s, "matchchar=")) ) {
989  cp += 10;
990  if (*cp=='\'') cp++;
991  else if (*cp=='"') cp++;
992  V->matchchar= *cp;
993  }
994  }
995  if (interleaved) readPAUPinterleaved(V);
996  else readPAUPsequential(V);
997  }
998  break;
999 
1000  /* kPhylip: ! can't determine in middle of file which type it is...*/
1001  /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */
1002  case kPhylip2:
1004  break;
1005  case kPhylip4: /* == kPhylip3 */
1007  break;
1008 
1009  default:
1010  V->err = eUnknownFormat;
1011  break;
1012 
1013  case kFitch :
1014  strcpy(V->seqid, V->s); GetLine(V);
1015  readFitch(V);
1016  break;
1017 
1018  case kGCG:
1019  do {
1020  gotuw = (strstr(V->s,"..") != NULL);
1021  if (gotuw) readUWGCG(V);
1022  GetLine(V);
1023  } while (!(feof(V->f) || V->allDone));
1024  break;
1025  }
1026  }
1027 
1028  V->filestart= false;
1029  V->seq[V->seqlen] = 0; /* stick a string terminator on it */
1030 }
1031 
1032 
1034  const short whichEntry_, /* index to sequence in file */
1035  FILE *fp_, /* pointer to open seq file */
1036  const long skiplines_,
1037  const short format_, /* sequence file format */
1038  long *seqlen_, /* return seq size */
1039  short *nseq_, /* number of seqs in file, for listSeqs() */
1040  short *error_, /* return error */
1041  char *seqid_) /* return seq name/info */
1042 {
1043  struct ReadSeqVars V;
1044 
1045  if (format_ < kMinFormat || format_ > kMaxFormat) {
1046  *error_ = eUnknownFormat;
1047  *seqlen_ = 0;
1048  return NULL;
1049  }
1050 
1051  V.choice = whichEntry_;
1052  V.fname = NULL; /* don't know */
1053  V.seq = (char*) calloc(1, kStartLength+1);
1054  V.maxseq = kStartLength;
1055  V.seqlen = 0;
1056  V.seqid = seqid_;
1057 
1058  V.f = fp_;
1059  V.filestart= (ftell( fp_) == 0);
1060  /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */
1061  if (V.filestart) V.nseq = 0;
1062  else V.nseq= *nseq_; /* track where we are in file...*/
1063 
1064  *V.seqid = '\0';
1065  V.err = 0;
1066  V.nseq = 0;
1067  V.isseqchar = isSeqChar;
1068  V.isseqcharfirst8 = 0;
1069  if (V.choice == kListSequences) ; /* leave as is */
1070  else if (V.choice <= 0) V.choice = 1; /* default ?? */
1071  V.addit = (V.choice > 0);
1072  V.allDone = false;
1073 
1074  readSeqMain(&V, skiplines_, format_);
1075 
1076  *error_ = V.err;
1077  *seqlen_ = V.seqlen;
1078  *nseq_ = V.nseq;
1079  return V.seq;
1080 }
1081 
1082 char *readSeq(
1083  const short whichEntry_, /* index to sequence in file */
1084  const char *filename_, /* file name */
1085  const long skiplines_,
1086  const short format_, /* sequence file format */
1087  long *seqlen_, /* return seq size */
1088  short *nseq_, /* number of seqs in file, for listSeqs() */
1089  short *error_, /* return error */
1090  char *seqid_) /* return seq name/info */
1091 {
1092  struct ReadSeqVars V;
1093 
1094  if (format_ < kMinFormat || format_ > kMaxFormat) {
1095  *error_ = eUnknownFormat;
1096  *seqlen_ = 0;
1097  return NULL;
1098  }
1099 
1100  V.choice = whichEntry_;
1101  V.fname = filename_; /* don't need to copy string, just ptr to it */
1102  V.seq = (char*) calloc(1, kStartLength+1);
1103  V.maxseq = kStartLength;
1104  V.seqlen = 0;
1105  V.seqid = seqid_;
1106 
1107  V.f = NULL;
1108  *V.seqid = '\0';
1109  V.err = 0;
1110  V.nseq = 0;
1111  V.isseqchar = isSeqChar;
1112  V.isseqcharfirst8 = 0;
1113  if (V.choice == kListSequences) ; /* leave as is */
1114  else if (V.choice <= 0) V.choice = 1; /* default ?? */
1115  V.addit = (V.choice > 0);
1116  V.allDone = false;
1117 
1118  V.f = fopen(V.fname, "r");
1119  V.filestart= true;
1120 
1121  readSeqMain(&V, skiplines_, format_);
1122 
1123  if (V.f != NULL) fclose(V.f);
1124  *error_ = V.err;
1125  *seqlen_ = V.seqlen;
1126  *nseq_ = V.nseq;
1127  return V.seq;
1128 }
1129 
1130 
1131 
1132 
1133 
1134 char *listSeqs(
1135  const char *filename_, /* file name */
1136  const long skiplines_,
1137  const short format_, /* sequence file format */
1138  short *nseq_, /* number of seqs in file, for listSeqs() */
1139  short *error_) /* return error */
1140 {
1141  char seqid[256];
1142  long seqlen;
1143 
1144  return readSeq( kListSequences, filename_, skiplines_, format_,
1145  &seqlen, nseq_, error_, seqid);
1146 }
1147 
1148 
1149 
1150 
1151 short seqFileFormat( /* return sequence format number, see ureadseq.h */
1152  const char *filename,
1153  long *skiplines, /* return #lines to skip any junk like mail header */
1154  short *error) /* return any error value or 0 */
1155 {
1156  FILE *fseq;
1157  short format;
1158 
1159  fseq = fopen(filename, "r");
1160  format= seqFileFormatFp( fseq, skiplines, error);
1161  if (fseq!=NULL) fclose(fseq);
1162  return format;
1163 }
1164 
1166  FILE *fseq,
1167  long *skiplines, /* return #lines to skip any junk like mail header */
1168  short *error) /* return any error value or 0 */
1169 {
1170  boolean foundIG= false, foundStrider= false,
1171  foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false,
1172  foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false,
1173  gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false,
1174  isfitch= false, isphylip= false, done= false;
1175  short format= kUnknown;
1176  int nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0;
1177  char sp[256];
1178  long linestart=0;
1179  int maxlines2check=500;
1180 
1181 #define ReadOneLine(sp) \
1182  { done |= (feof(fseq)); \
1183  readline( fseq, sp, &linestart); \
1184  if (!done) { splen = strlen(sp); ++nlines; } }
1185 
1186  *skiplines = 0;
1187  *error = 0;
1188  if (fseq == NULL) { *error = eFileNotFound; return kNoformat; }
1189 
1190  while ( !done ) {
1191  ReadOneLine(sp);
1192 
1193  /* check for mailer head & skip past if found */
1194  if (nlines < 4 && !done) {
1195  if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) {
1196  do {
1197  /* skip all lines until find one blank line */
1198  ReadOneLine(sp);
1199  if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ;
1200  } while ((!done) && (k < splen));
1201  *skiplines = nlines; /* !? do we want #lines or #bytes ?? */
1202  }
1203  }
1204 
1205  if (sp==NULL || *sp==0)
1206  ; /* nada */
1207 
1208  /* high probability identities: */
1209 
1210  else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") )
1211  gotMSF= true;
1212 
1213  else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL))
1214  gotuw= true;
1215 
1216  else if (strstr(sp,"identity: Data:") != NULL)
1217  gotolsen= true;
1218 
1219  else if ( strstr(sp,"::=") &&
1220  (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */
1221  strstr(sp,"Seq-entry") ||
1222  strstr(sp,"Seq-submit") ) ) /* can we read submit format? */
1223  gotasn1= true;
1224 
1225  else if ( strstr(sp,"#NEXUS") == sp )
1226  gotpaup= true;
1227 
1228  /* uncertain identities: */
1229 
1230  else if (*sp ==';') {
1231  if (strstr(sp,"Strider") !=NULL) foundStrider= true;
1232  else foundIG= true;
1233  }
1234 
1235  else if (strstr(sp,"LOCUS") == sp)
1236  foundGB= true;
1237  else if (strstr(sp,"ORIGIN") == sp)
1238  foundGB= true;
1239 
1240  else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */
1241  foundPIR= true;
1242  else if (strstr(sp,"SEQUENCE") == sp)
1243  foundPIR= true;
1244 
1245  else if (*sp == '>') {
1246  if (sp[3] == ';') foundNBRF= true;
1247  else foundPearson= true;
1248  }
1249 
1250  else if (strstr(sp,"ID ") == sp)
1251  foundEMBL= true;
1252  else if (strstr(sp,"SQ ") == sp)
1253  foundEMBL= true;
1254 
1255  else if (*sp == '(')
1256  foundZuker= true;
1257 
1258  else {
1259  if (nlines - *skiplines == 1) {
1260  int ispp= 0, ilen= 0;
1261  sscanf( sp, "%d%d", &ispp, &ilen);
1262  if (ispp > 0 && ilen > 0) isphylip= true;
1263  }
1264  else if (isphylip && nlines - *skiplines == 2) {
1265  int tseq;
1266  tseq= getseqtype(sp+10, strlen(sp+10));
1267  if ( isalpha(*sp) /* 1st letter in 2nd line must be of a name */
1268  && (tseq != kOtherSeq)) /* sequence section must be okay */
1269  foundPhylip= true;
1270  }
1271 
1272  for (k=0, isfitch= true; isfitch & (k < splen); k++) {
1273  if (k % 4 == 0) isfitch &= (sp[k] == ' ');
1274  else isfitch &= (sp[k] != ' ');
1275  }
1276  if (isfitch & (splen > 20)) foundFitch= true;
1277 
1278  /* kRNA && kDNA are fairly certain...*/
1279  switch (getseqtype( sp, splen)) {
1280  case kOtherSeq: otherlines++; break;
1281  case kAmino : if (splen>20) aminolines++; break;
1282  case kDNA :
1283  case kRNA : if (splen>20) dnalines++; break;
1284  case kNucleic : break; /* not much info ? */
1285  }
1286 
1287  }
1288 
1289  /* pretty certain */
1290  if (gotolsen) {
1291  format= kOlsen;
1292  done= true;
1293  }
1294  else if (gotMSF) {
1295  format= kMSF;
1296  done= true;
1297  }
1298  else if (gotasn1) {
1299  /* !! we need to look further and return kASNseqentry | kASNseqset */
1300  /*
1301  seqentry key is Seq-entry ::=
1302  seqset key is Bioseq-set ::=
1303  ?? can't read these yet w/ ncbi tools ??
1304  Seq-submit ::=
1305  Bioseq ::= << fails both bioseq-seq and seq-entry parsers !
1306  */
1307  if (strstr(sp,"Bioseq-set")) format= kASNseqset;
1308  else if (strstr(sp,"Seq-entry")) format= kASNseqentry;
1309  else format= kASN1; /* other form, we can't yet read... */
1310  done= true;
1311  }
1312  else if (gotpaup) {
1313  format= kPAUP;
1314  done= true;
1315  }
1316 
1317  else if (gotuw) {
1318  if (foundIG) format= kIG; /* a TOIG file from GCG for certain */
1319  else format= kGCG;
1320  done= true;
1321  }
1322 
1323  else if ((dnalines > 1) || done || (nlines > maxlines2check)) {
1324  /* decide on most likely format */
1325  /* multichar idents: */
1326  if (foundStrider) format= kStrider;
1327  else if (foundGB) format= kGenBank;
1328  else if (foundPIR) format= kPIR;
1329  else if (foundEMBL) format= kEMBL;
1330  else if (foundNBRF) format= kNBRF;
1331  /* single char idents: */
1332  else if (foundIG) format= kIG;
1333  else if (foundPearson) format= kPearson;
1334  else if (foundZuker) format= kZuker;
1335  /* digit ident: */
1336  else if (foundPhylip) format= kPhylip;
1337  /* spacing ident: */
1338  else if (foundFitch) format= kFitch;
1339  /* no format chars: */
1340  else if (otherlines > 0) format= kUnknown;
1341  else if (dnalines > 1) format= kPlain;
1342  else if (aminolines > 1) format= kPlain;
1343  else format= kUnknown;
1344 
1345  done= true;
1346  }
1347 
1348  /* need this for possible long header in olsen format */
1349  else if (strstr(sp,"): ") != NULL)
1350  maxlines2check++;
1351  }
1352 
1353  if (format == kPhylip) {
1354  /* check for interleaved or sequential -- really messy */
1355  int tname, tseq;
1356  long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0;
1357  char *ps;
1358 
1359  rewind(fseq);
1360  for (i=0; i < *skiplines; i++) ReadOneLine(sp);
1361  nlines= 0;
1362  ReadOneLine(sp);
1363  sscanf( sp, "%ld%ld", &nspp, &nlen);
1364  ReadOneLine(sp); /* 1st seq line */
1365  for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1366 
1367  for (i= 1; i<nspp; i++) {
1368  ReadOneLine(sp);
1369 
1370  tseq= getseqtype(sp+10, strlen(sp+10));
1371  tname= getseqtype(sp, 10);
1372  for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
1373  for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1374 
1375  /* find probable interleaf or sequential ... */
1376  if (j>=9) seq += 10; /* pretty certain not ileaf */
1377  else {
1378  if (tseq != tname) leaf++; else seq++;
1379  if (tname == kDNA || tname == kRNA) seq++; else leaf++;
1380  }
1381 
1382  if (ilen <= nlen && j<9) {
1383  if (tname == kOtherSeq) leaf += 10;
1384  else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++;
1385  }
1386  else if (ilen > nlen) {
1387  ilen= 0;
1388  }
1389  }
1390  for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */
1391  ReadOneLine(sp);
1392 
1393  tseq= getseqtype(sp+10, strlen(sp+10));
1394  tname= getseqtype(sp, 10);
1395  for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1396  for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
1397  if (j<9) {
1398  if (tname == kOtherSeq) seq += 10;
1399  if (tseq != tname) seq++; else leaf++;
1400  if (tname == kDNA || tname == kRNA) leaf++; else seq++;
1401  }
1402  if (ilen > nlen) {
1403  if (j>9) leaf += 10; /* must be a name here for sequent */
1404  else if (tname == kOtherSeq) seq += 10;
1405  ilen= 0;
1406  }
1407  }
1408 
1409  if (leaf > seq) format= kPhylip4;
1410  else format= kPhylip2;
1411  }
1412 
1413  return(format);
1414 #undef ReadOneLine
1415 } /* SeqFileFormat */
1416 
1417 
1418 
1419 
1420 unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal)
1421 /* GCGchecksum */
1422 {
1423  long i, check = 0, count = 0;
1424 
1425  for (i = 0; i < seqlen; i++) {
1426  count++;
1427  check += count * to_upper(seq[i]);
1428  if (count == 57) count = 0;
1429  }
1430  check %= 10000;
1431  *checktotal += check;
1432  *checktotal %= 10000;
1433  return check;
1434 }
1435 
1436 /* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */
1437 const unsigned long crctab[] = {
1438  0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
1439  0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
1440  0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
1441  0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
1442  0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
1443  0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
1444  0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
1445  0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
1446  0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
1447  0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
1448  0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
1449  0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
1450  0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
1451  0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
1452  0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
1453  0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
1454  0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
1455  0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
1456  0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
1457  0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
1458  0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
1459  0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
1460  0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
1461  0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
1462  0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
1463  0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
1464  0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
1465  0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
1466  0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
1467  0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
1468  0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
1469  0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
1470  0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
1471  0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
1472  0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
1473  0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
1474  0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
1475  0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
1476  0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
1477  0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
1478  0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
1479  0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
1480  0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
1481  0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
1482  0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
1483  0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
1484  0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
1485  0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
1486  0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
1487  0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
1488  0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
1489  0x2d02ef8dL
1490 };
1491 
1492 unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)
1493 /*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */
1494 {
1495  unsigned long c = 0xffffffffL;
1496  long n = seqlen;
1497 
1498  if (!seq) return 0;
1499  while (n--) {
1500  c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8);
1501  seq++; /* fixed aug'98 finally */
1502  }
1503  c= c ^ 0xffffffffL;
1504  *checktotal += c;
1505  return c;
1506 }
1507 
1508 
1509 
1510 
1511 short getseqtype( const char *seq, const long seqlen)
1512 { /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */
1513  char c;
1514  short i, maxtest;
1515  short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0;
1516 
1517  maxtest = min(300, seqlen);
1518  for (i = 0; i < maxtest; i++) {
1519  c = to_upper(seq[i]);
1520  if (strchr(protonly, c)) po++;
1521  else if (strchr(primenuc,c)) {
1522  na++;
1523  if (c == 'T') nt++;
1524  else if (c == 'U') nu++;
1525  }
1526  else if (strchr(aminos,c)) aa++;
1527  else if (strchr(seqsymbols,c)) ns++;
1528  else if (isalpha(c)) no++;
1529  }
1530 
1531  if ((no > 0) || (po+aa+na == 0)) return kOtherSeq;
1532  /* ?? test for probability of kOtherSeq ?, e.g.,
1533  else if (po+aa+na / maxtest < 0.70) return kOtherSeq;
1534  */
1535  else if (po > 0) return kAmino;
1536  else if (aa == 0) {
1537  if (nu > nt) return kRNA;
1538  else return kDNA;
1539  }
1540  else if (na > aa) return kNucleic;
1541  else return kAmino;
1542 } /* getseqtype */
1543 
1544 
1545 char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen)
1546 {
1547  char *a, *b;
1548  long i;
1549  char *newseq;
1550 
1551  *newlen= 0;
1552  if (!seq) return NULL;
1553  newseq = (char*) malloc(seqlen+1);
1554  if (!newseq) return NULL;
1555  for (a= (char*)seq, b=newseq, i=0; *a!=0; a++)
1556  if (*a != gapc) {
1557  *b++= *a;
1558  i++;
1559  }
1560  *b= '\0';
1561  newseq = (char*) realloc(newseq, i+1);
1562  *newlen= i;
1563  return newseq;
1564 }
1565 
1566 
1567 
1568 /***
1569 char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \
1570 {\\fonttbl\
1571  {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\
1572  {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\
1573  {\\f5\\froman Times;}{\\f6\\froman Palatino;}\
1574  {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\
1575 {\\stylesheet\
1576  {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\
1577  {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\
1578  {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}";
1579 
1580 char *rtftail = "}";
1581 ****/
1582 
1583 short writeSeq(FILE *outf, const char *seq, const long seqlen,
1584  const short outform, const char *seqid)
1585 /* dump sequence to standard output */
1586 {
1587  const short kSpaceAll = -9;
1588 #define kMaxseqwidth 250
1589 
1590  boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */
1591  short numline = 0; /* only true if we are writing seq number line (for interleave) */
1592  boolean numright = false, numleft = false;
1593  boolean nameright = false, nameleft = false;
1594  short namewidth = 8, numwidth = 8;
1595  short spacer = 0, width = 50, tab = 0;
1596  /* new parameters: width, spacer, those above... */
1597 
1598  short linesout = 0, seqtype = kNucleic;
1599  long i, j, l, l1, ibase;
1600  char idword[31], endstr[14];
1601  char seqnamestore[128], *seqname = seqnamestore;
1602  char s[kMaxseqwidth], *cp;
1603  char nameform[10], numform[10], nocountsymbols[10];
1604  unsigned long checksum = 0, checktotal = 0;
1605 
1606  gPretty.atseq++;
1607  skipwhitespace(seqid);
1608  l = min(128, strlen(seqid));
1609  strncpy( seqnamestore, seqid, l);
1610  seqname[l] = 0;
1611 
1612  sscanf( seqname, "%30s", idword);
1613  sprintf(numform, "%ld", seqlen);
1614  numwidth= strlen(numform)+1;
1615  nameform[0]= '\0';
1616 
1617  if (strstr(seqname,"checksum") != NULL) {
1618  cp = strstr(seqname,"bases");
1619  if (cp!=NULL) {
1620  for ( ; (cp!=seqname) && (*cp!=','); cp--) ;
1621  if (cp!=seqname) *cp=0;
1622  }
1623  }
1624 
1625  strcpy( endstr,"");
1626  l1 = 0;
1627 
1628  if (outform == kGCG || outform == kMSF)
1629  checksum = GCGchecksum(seq, seqlen, &checktotal);
1630  else
1631  checksum = seqchecksum(seq, seqlen, &checktotal);
1632 
1633  switch (outform) {
1634 
1635  case kPlain:
1636  case kUnknown: /* no header, just sequence */
1637  strcpy(endstr,"\n"); /* end w/ extra blank line */
1638  break;
1639 
1640  case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */
1641  case kGenBank:
1642  fprintf(outf,"LOCUS %s %ld bp\n", idword, seqlen);
1643  fprintf(outf,"DEFINITION %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1644  /* fprintf(outf,"ACCESSION %s\n", accnum); */
1645  fprintf(outf,"ORIGIN \n");
1646  spacer = 11;
1647  numleft = true;
1648  numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */
1649  strcpy(endstr, "\n//");
1650  linesout += 4;
1651  break;
1652 
1653  case kPIR:
1654  /* somewhat like genbank... \\\*/
1655  /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */
1656  fprintf(outf,"ENTRY %s \n", idword);
1657  fprintf(outf,"TITLE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1658  /* fprintf(outf,"ACCESSION %s\n", accnum); */
1659  fprintf(outf,"SEQUENCE \n");
1660  numwidth = 7;
1661  width= 30;
1662  spacer = kSpaceAll;
1663  numleft = true;
1664  strcpy(endstr, "\n///");
1665  /* run a top number line for PIR */
1666  for (j=0; j<numwidth; j++) fputc(' ',outf);
1667  for (j= 5; j<=width; j += 5) fprintf(outf,"%10ld",j);
1668  fputc('\n',outf);
1669  linesout += 5;
1670  break;
1671 
1672  case kNBRF:
1673  if (getseqtype(seq, seqlen) == kAmino)
1674  fprintf(outf,">P1;%s\n", idword);
1675  else
1676  fprintf(outf,">DL;%s\n", idword);
1677  fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1678  spacer = 11;
1679  strcpy(endstr,"*\n");
1680  linesout += 3;
1681  break;
1682 
1683  case kEMBL:
1684  fprintf(outf,"ID %s\n", idword);
1685  /* fprintf(outf,"AC %s\n", accnum); */
1686  fprintf(outf,"DE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1687  fprintf(outf,"SQ %ld BP\n", seqlen);
1688  strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
1689  tab = 4;
1690  spacer = 11;
1691  width = 60;
1692  linesout += 4;
1693  break;
1694 
1695  case kGCG:
1696  fprintf(outf,"%s\n", seqname);
1697  /* fprintf(outf,"ACCESSION %s\n", accnum); */
1698  fprintf(outf," %s Length: %ld (today) Check: %ld ..\n", idword, seqlen, checksum);
1699  spacer = 11;
1700  numleft = true;
1701  strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
1702  linesout += 3;
1703  break;
1704 
1705  case kStrider: /* ?? map ?*/
1706  fprintf(outf,"; ### from DNA Strider ;-)\n");
1707  fprintf(outf,"; DNA sequence %s, %ld bases, %lX checksum.\n;\n", seqname, seqlen, checksum);
1708  strcpy(endstr, "\n//");
1709  linesout += 3;
1710  break;
1711 
1712  case kFitch:
1713  fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1714  spacer = 4;
1715  width = 60;
1716  linesout += 1;
1717  break;
1718 
1719  case kPhylip2:
1720  case kPhylip4:
1721  /* this is version 3.2/3.4 -- simplest way to write
1722  version 3.3 is to write as version 3.2, then
1723  re-read file and interleave the species lines */
1724  if (strlen(idword)>10) idword[10] = 0;
1725  fprintf(outf,"%-10s ",idword);
1726  l1 = -1;
1727  tab = 12;
1728  spacer = 11;
1729  break;
1730 
1731  case kASN1:
1732  seqtype= getseqtype(seq, seqlen);
1733  switch (seqtype) {
1734  case kDNA : cp= (char*)"dna"; break;
1735  case kRNA : cp= (char*)"rna"; break;
1736  case kNucleic : cp= (char*)"na"; break;
1737  case kAmino : cp= (char*)"aa"; break;
1738  case kOtherSeq: cp= (char*)"not-set"; break;
1739  }
1740  fprintf(outf," seq {\n");
1741  fprintf(outf," id { local id %d },\n", gPretty.atseq);
1742  fprintf(outf," descr { title \"%s\" },\n", seqid);
1743  fprintf(outf," inst {\n");
1744  fprintf(outf," repr raw, mol %s, length %ld, topology linear,\n", cp, seqlen);
1745  fprintf(outf," seq-data\n");
1746  if (seqtype == kAmino)
1747  fprintf(outf," iupacaa \"");
1748  else
1749  fprintf(outf," iupacna \"");
1750  l1 = 17;
1751  spacer = 0;
1752  width = 78;
1753  tab = 0;
1754  strcpy(endstr,"\"\n } } ,");
1755  linesout += 7;
1756  break;
1757 
1758  case kPAUP:
1759  nameleft= true;
1760  namewidth = 9;
1761  spacer = 21;
1762  width = 100;
1763  tab = 0; /* 1; */
1764  /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */
1765  /* do a header comment line for paup */
1766  fprintf(outf,"[Name: %-16s Len:%6ld Check: %8lX]\n", idword, seqlen, checksum);
1767  linesout += 1;
1768  break;
1769 
1770  case kPretty:
1771  numline= gPretty.numline;
1772  baseonlynum= gPretty.baseonlynum;
1773  namewidth = gPretty.namewidth;
1774  numright = gPretty.numright;
1775  numleft = gPretty.numleft;
1776  nameright = gPretty.nameright;
1777  nameleft = gPretty.nameleft;
1778  spacer = gPretty.spacer + 1;
1779  width = gPretty.seqwidth;
1780  tab = gPretty.tab;
1781  /* also add rtf formatting w/ font, size, style */
1782  if (gPretty.nametop) {
1783  fprintf(outf,"Name: %-16s Len:%6ld Check: %8lX\n", idword, seqlen, checksum);
1784  linesout++;
1785  }
1786  break;
1787 
1788  case kMSF:
1789  fprintf(outf," Name: %-16s Len:%6ld Check: %5ld Weight: 1.00\n",
1790  idword, seqlen, checksum);
1791  linesout++;
1792  nameleft= true;
1793  namewidth= 15; /* need MAX namewidth here... */
1794  sprintf(nameform, "%%+%ds ",namewidth);
1795  spacer = 11;
1796  width = 50;
1797  tab = 0; /* 1; */
1798  break;
1799 
1800  case kIG:
1801  fprintf(outf,";%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1802  fprintf(outf,"%s\n", idword);
1803  strcpy(endstr,"1"); /* == linear dna */
1804  linesout += 2;
1805  break;
1806 
1807  default :
1808  case kZuker: /* don't attempt Zuker's ftn format */
1809  case kPearson:
1810  fprintf(outf,">%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1811  linesout += 1;
1812  break;
1813  }
1814 
1815  if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth);
1816  if (numline) sprintf(numform, "%%%ds ",numwidth);
1817  else sprintf(numform, "%%%dd ",numwidth);
1818  strcpy( nocountsymbols, kNocountsymbols);
1819  if (baseonlynum) {
1820  if (strchr(nocountsymbols,gPretty.gapchar)==NULL) {
1821  strcat(nocountsymbols," ");
1822  nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar;
1823  }
1824  if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) {
1825  *cp= ' ';
1826  }
1827  }
1828 
1829  if (numline) {
1830  *idword= 0;
1831  }
1832 
1833  width = min(width,kMaxseqwidth);
1834  for (i=0, l=0, ibase = 1; i < seqlen; ) {
1835 
1836  if (l1 < 0) l1 = 0;
1837  else if (l1 == 0) {
1838  if (nameleft) fprintf(outf, nameform, idword);
1839  if (numleft) { if (numline) fprintf(outf, numform, "");
1840  else fprintf(outf, numform, ibase);}
1841  for (j=0; j<tab; j++) fputc(' ',outf);
1842  }
1843 
1844  l1++; /* don't count spaces for width*/
1845  if (numline) {
1846  if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) {
1847  if (numline==1) fputc(' ',outf);
1848  s[l++] = ' ';
1849  }
1850  if (l1 % 10 == 1 || l1 == width) {
1851  if (numline==1) fprintf(outf,"%-9ld ",i+1);
1852  s[l++]= '|'; /* == put a number here */
1853  }
1854  else s[l++]= ' ';
1855  i++;
1856  }
1857 
1858  else {
1859  if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
1860  s[l++] = ' ';
1861  if (!baseonlynum) ibase++;
1862  else if (0==strchr(nocountsymbols,seq[i])) ibase++;
1863  s[l++] = seq[i++];
1864  }
1865 
1866  if (l1 == width || i == seqlen) {
1867  if (outform==kPretty) for ( ; l1<width; l1++) {
1868  if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
1869  s[l++] = ' ';
1870  s[l++]=' '; /* pad w/ blanks */
1871  }
1872  s[l] = '\0';
1873  l = 0; l1 = 0;
1874 
1875  if (numline) {
1876  if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */
1877  }
1878  else {
1879  if (i == seqlen) fprintf(outf,"%s%s",s,endstr);
1880  else fprintf(outf,"%s",s);
1881  if (numright || nameright) fputc(' ',outf);
1882  if (numright) fprintf(outf,numform, ibase-1);
1883  if (nameright) fprintf(outf, nameform,idword);
1884  }
1885  fputc('\n',outf);
1886  linesout++;
1887  }
1888  }
1889  return linesout;
1890 } /*writeSeq*/
1891 
1892 
1893 
1894 /* End file: ureadseq.c */
#define Local
Definition: ureadseq.c:70
char * readSeqFp(const short whichEntry_, FILE *fp_, const long skiplines_, const short format_, long *seqlen_, short *nseq_, short *error_, char *seqid_)
Definition: ureadseq.c:1033
#define skipwhitespace(string)
Definition: ureadseq.h:17
#define kZuker
Definition: ureadseq.h:58
long topseqlen
Definition: ureadseq.c:100
long linestart
Definition: ureadseq.c:105
char * seq
Definition: ureadseq.c:102
#define kASN1
Definition: ureadseq.h:67
boolean baseonlynum
Definition: ureadseq.h:85
static GB_ERROR tab(GBL_command_arguments *args, bool pretab)
Definition: adlang1.cxx:914
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
int(* isseqcharfirst8)()
Definition: ureadseq.c:108
const unsigned long crctab[]
Definition: ureadseq.c:1437
#define kPhylip
Definition: ureadseq.h:63
#define tolowerstr(s)
Local void readEMBL(struct ReadSeqVars *V)
Definition: ureadseq.c:424
char matchchar
Definition: ureadseq.h:89
char * sp
Definition: ureadseq.c:106
Local void readOlsen(struct ReadSeqVars *V)
Definition: ureadseq.c:539
#define kStartLength
Definition: ureadseq.c:73
boolean numleft
Definition: ureadseq.h:86
#define kOlsen
Definition: ureadseq.h:59
Local void readMSF(struct ReadSeqVars *V)
Definition: ureadseq.c:622
Local boolean endPearson(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:395
Local boolean endFitch(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:470
boolean nameleft
Definition: ureadseq.h:87
Local void addseq(char *s, struct ReadSeqVars *V)
Definition: ureadseq.c:155
Local void readLoop(short margin, boolean addfirst, boolean(*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V), struct ReadSeqVars *V)
Definition: ureadseq.c:214
#define fixmatchchar(s)
Local void readPAUPinterleaved(struct ReadSeqVars *V)
Definition: ureadseq.c:685
int isSeqChar(int c)
Definition: ureadseq.c:115
#define kAmino
Definition: ureadseq.h:47
const char * fname
Definition: ureadseq.c:101
Local void addinfo(char *s, struct ReadSeqVars *V)
Definition: ureadseq.c:194
Local void readline(FILE *f, char *s, long *linestart)
Definition: ureadseq.c:131
char gapchar
Definition: ureadseq.h:89
boolean domatch
Definition: ureadseq.h:88
char * compressSeq(const char gapc, const char *seq, const long seqlen, long *newlen)
Definition: ureadseq.c:1545
#define eMemFull
Definition: ureadseq.h:29
#define kUnknown
Definition: ureadseq.h:74
Local void readPhylipSequential(struct ReadSeqVars *V)
Definition: ureadseq.c:912
boolean filestart
Definition: ureadseq.c:103
#define kPhylip2
Definition: ureadseq.h:60
#define kRNA
Definition: ureadseq.h:45
int Strncasecmp(const char *a, const char *b, long maxn)
Definition: ureadseq.c:48
const char * aminos
Definition: ureadseq.c:75
short err
Definition: ureadseq.c:97
prettyopts gPretty
Definition: ureadseq.c:31
int Strcasecmp(const char *a, const char *b)
Definition: ureadseq.c:34
const char stdsymbols[6]
Definition: ureadseq.c:80
long maxseq
Definition: ureadseq.c:98
#define kASNseqset
Definition: ureadseq.h:78
FILE * seq
Definition: rns.c:46
char * listSeqs(const char *filename_, const long skiplines_, const short format_, short *nseq_, short *error_)
Definition: ureadseq.c:1134
#define eNoData
Definition: ureadseq.h:28
const char nonummask[11]
Definition: ureadseq.c:85
static const char * seqsymbols
Definition: ureadseq.c:82
Local void readStrider(struct ReadSeqVars *V)
Definition: ureadseq.c:281
Local void countseq(char *s, struct ReadSeqVars *V)
Definition: ureadseq.c:181
Local boolean endIG(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:245
Local void readZuker(struct ReadSeqVars *V)
Definition: ureadseq.c:451
#define kMSF
Definition: ureadseq.h:66
short namewidth
Definition: ureadseq.h:91
#define kNBRF
Definition: ureadseq.h:52
static int diff(int v1, int v2, int v3, int v4, int st, int en)
Definition: ClustalV.cxx:534
#define kPlain
Definition: ureadseq.h:64
#define kStrider
Definition: ureadseq.h:55
long seqlencount
Definition: ureadseq.c:98
Local void readNBRF(struct ReadSeqVars *V)
Definition: ureadseq.c:379
FILE * f
Definition: ureadseq.c:104
Local void readIG(struct ReadSeqVars *V)
Definition: ureadseq.c:252
#define eFileNotFound
Definition: ureadseq.h:27
boolean allDone
Definition: ureadseq.c:103
Local void readPIR(struct ReadSeqVars *V)
Definition: ureadseq.c:306
Local void readSeqMain(struct ReadSeqVars *V, const long skiplines_, const short format_)
Definition: ureadseq.c:937
Local boolean endPIR(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:299
#define kNoformat
Definition: ureadseq.h:73
#define kPAUP
Definition: ureadseq.h:68
boolean nameright
Definition: ureadseq.h:87
const char kNocountsymbols[5]
Definition: ureadseq.c:79
short seqwidth
Definition: ureadseq.h:92
Local boolean endPhylipSequential(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:904
static void error(const char *msg)
Definition: mkptypes.cxx:96
fputc('\n', stderr)
#define kGenBank
Definition: ureadseq.h:51
short choice
Definition: ureadseq.c:97
short atseq
Definition: ureadseq.h:90
#define kIG
Definition: ureadseq.h:50
boolean interleaved
Definition: readseq.c:463
char s[256]
Definition: ureadseq.c:106
Local void readFitch(struct ReadSeqVars *V)
Definition: ureadseq.c:480
#define kASNseqentry
Definition: ureadseq.h:77
long outform
Definition: readseq.c:466
#define kNucleic
Definition: ureadseq.h:46
#define kMaxseqwidth
#define kGCG
Definition: ureadseq.h:54
Local boolean endStrider(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:274
char * seqid
Definition: ureadseq.c:102
boolean nametop
Definition: ureadseq.h:87
Local boolean endZuker(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:444
const char nummask[11]
Definition: ureadseq.c:84
boolean addit
Definition: ureadseq.c:103
#define kFitch
Definition: ureadseq.h:56
Local void readPAUPsequential(struct ReadSeqVars *V)
Definition: ureadseq.c:774
#define kOtherSeq
Definition: ureadseq.h:43
short writeSeq(FILE *outf, const char *seq, const long seqlen, const short outform, const char *seqid)
Definition: ureadseq.c:1583
#define kEMBL
Definition: ureadseq.h:53
unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)
Definition: ureadseq.c:1492
int(* isseqchar)()
Definition: ureadseq.c:109
boolean done
Definition: ureadseq.c:103
int isAnyChar(int c)
Definition: ureadseq.c:126
#define to_upper(c)
Definition: ureadseq.h:23
#define kListSequences
Definition: ureadseq.h:40
#define eUnknownFormat
Definition: ureadseq.h:34
Local void readGenBank(struct ReadSeqVars *V)
Definition: ureadseq.c:334
#define kPhylip4
Definition: ureadseq.h:61
Local void GetLine(struct ReadSeqVars *V)
Definition: ureadseq.c:144
#define kDNA
Definition: ureadseq.h:44
int isSeqNumChar(int c)
Definition: ureadseq.c:120
short topnseq
Definition: ureadseq.c:99
char(& no)[2]
Definition: downcast.h:31
Local void readPlain(struct ReadSeqVars *V)
Definition: ureadseq.c:494
short seqFileFormatFp(FILE *fseq, long *skiplines, short *error)
Definition: ureadseq.c:1165
#define seqchecksum
Definition: ureadseq.h:140
short seqFileFormat(const char *filename, long *skiplines, short *error)
Definition: ureadseq.c:1151
short spacer
Definition: ureadseq.h:92
unsigned long GCGchecksum(const char *seq, const long seqlen, unsigned long *checktotal)
Definition: ureadseq.c:1420
const char * protonly
Definition: ureadseq.c:77
Local void unGetLine(struct ReadSeqVars *V)
Definition: ureadseq.c:149
Local boolean endGB(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:327
Local void readPearson(struct ReadSeqVars *V)
Definition: ureadseq.c:402
#define kPearson
Definition: ureadseq.h:57
short nseq
Definition: ureadseq.c:97
const char allsymbols[32]
Definition: ureadseq.c:81
#define kMaxFormat
Definition: ureadseq.h:71
#define ReadOneLine(sp)
Local boolean endNBRF(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:359
Local void readPhylipInterleaved(struct ReadSeqVars *V)
Definition: ureadseq.c:851
long seqlen
Definition: ureadseq.c:98
char * readSeq(const short whichEntry_, const char *filename_, const long skiplines_, const short format_, long *seqlen_, short *nseq_, short *error_, char *seqid_)
Definition: ureadseq.c:1082
#define min(a, b)
Definition: f2c.h:153
short getseqtype(const char *seq, const long seqlen)
Definition: ureadseq.c:1511
#define kPretty
Definition: ureadseq.h:69
char matchchar
Definition: ureadseq.c:102
#define kPIR
Definition: ureadseq.h:65
Local boolean endEMBL(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:417
short numline
Definition: ureadseq.h:90
const char * primenuc
Definition: ureadseq.c:76
Local void readUWGCG(struct ReadSeqVars *V)
Definition: ureadseq.c:512
short tab
Definition: ureadseq.h:92
boolean numright
Definition: ureadseq.h:86
GB_write_int const char s
Definition: AW_awar.cxx:154