ARB
ureadseq.c
Go to the documentation of this file.
1 /* File: ureadseq.c
2  *
3  * Reads and writes nucleic/protein sequence in various
4  * formats. Data files may have multiple sequences.
5  *
6  * Copyright 1990 by d.g.gilbert
7  * biology dept., indiana university, bloomington, in 47405
8  * e-mail: gilbertd@bio.indiana.edu
9  *
10  * This program may be freely copied and used by anyone.
11  * Developers are encourged to incorporate parts in their
12  * programs, rather than devise their own private sequence
13  * format.
14  *
15  * This should compile and run with any ANSI C compiler.
16  *
17  * ------------------------------------------------------------
18  *
19  * Some slight modifications were done by ARB developers
20  *
21  */
22 
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #define __NO_CTYPE
28 #include <ctype.h>
29 
30 #include "ureadseq.h"
32 
33 #pragma segment ureadseq
34 
35 
36 int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */
37 {
38  int diff, done;
39  if (a == b) return 0;
40  done = 0;
41  while (! done) {
42  diff = to_upper(*a) - to_upper(*b);
43  if (diff) return diff;
44  if (*a == '\0') done = 1;
45  else { a++; b++; }
46  }
47  return 0;
48 }
49 
50 int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */
51 {
52  int diff, done;
53  if (a == b) return 0;
54  done = 0;
55  while (! done) {
56  diff = to_upper(*a) - to_upper(*b);
57  if (diff) return diff;
58  if (*a == '\0') done = 1;
59  else {
60  a++; b++; maxn--;
61  if (! maxn) done = 1;
62  }
63  }
64  return 0;
65 }
66 
67 
68 
69 
70 
71 #ifndef Local
72 # define Local static /* local functions */
73 #endif
74 
75 #define kStartLength 500000 /* 20Apr93 temp. bug fix */
76 
77 const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
78 const char *primenuc = "ACGTU";
79 const char *protonly = "EFIPQZ";
80 
81 const char kNocountsymbols[5] = "_.-?";
82 const char stdsymbols[6] = "_.-*?";
83 const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\";
84 static const char *seqsymbols = allsymbols;
85 
86 const char nummask[11] = "0123456789";
87 const char nonummask[11] = "~!@#$%^&*(";
88 
89 /*
90  use general form of isseqchar -- all chars + symbols.
91  no formats except nbrf (?) use symbols in data area as
92  anything other than sequence chars.
93 */
94 
95 
96 
97  /* Local variables for readSeq: */
98 struct ReadSeqVars {
99  short choice, err, nseq;
101  short topnseq;
102  long topseqlen;
103  const char *fname;
104  char *seq, *seqid, matchchar;
106  FILE *f;
107  long linestart;
108  char s[256], *sp;
109 
110  int (*isseqcharfirst8)(); /* Patch by o. strunk (ARB) to allow numbers in genbank sequences*/
112  /* int (*isseqchar)(int c); << sgi cc hates (int c) */
113 };
114 
115 
116 
117 int isSeqChar(int c)
118 {
119  return (isalpha(c) || strchr(seqsymbols,c));
120 }
121 
122 int isSeqNumChar(int c)
123 {
124  return (isalnum(c) || strchr(seqsymbols,c));
125 }
126 
127 
128 int isAnyChar(int c)
129 {
130  return isascii(c); /* wrap in case isascii is macro */
131 }
132 
133 Local void readline(FILE *f, char *s, long *linestart)
134 {
135  char *cp;
136 
137  *linestart= ftell(f);
138  if (NULL == fgets(s, 256, f))
139  *s = 0;
140  else {
141  cp = strchr(s, '\n');
142  if (cp != NULL) *cp = 0;
143  }
144 }
145 
146 Local void GetLine(struct ReadSeqVars *V)
147 {
148  readline(V->f, V->s, &V->linestart);
149 }
150 
151 Local void unGetLine(struct ReadSeqVars *V)
152 {
153  fseek(V->f, V->linestart, 0);
154 }
155 
156 
157 Local void addseq(char *s, struct ReadSeqVars * V)
158 {
159  char *ptr;
160  int count = 0;
161  if (V->addit) {
162  for (;*s != 0;s++,count++) {
163  if (count < 9 && V->isseqcharfirst8) {
164  if (!(V->isseqcharfirst8) (*s)) continue;
165  }
166  else {
167  if (!(V->isseqchar) (*s)) continue;
168  }
169  if (V->seqlen >= V->maxseq) {
170  V->maxseq += kStartLength;
171  ptr = (char *) realloc(V->seq, V->maxseq + 1);
172  if (ptr == NULL) {
173  V->err = eMemFull;
174  return;
175  } else
176  V->seq = ptr;
177  }
178  V->seq[(V->seqlen)++] = *s;
179  }
180  }
181 }
182 
183 Local void countseq(char *s, struct ReadSeqVars *V)
184  /* this must count all valid seq chars, for some formats (paup-sequential) even
185  if we are skipping seq... */
186 {
187  while (*s != 0) {
188  if ((V->isseqchar)(*s)) {
189  (V->seqlencount)++;
190  }
191  s++;
192  }
193 }
194 
195 
196 Local void addinfo(char *s, struct ReadSeqVars *V)
197 {
198  char s2[256], *si;
199  boolean saveadd;
200 
201  si = s2;
202  while (*s == ' ') s++;
203  sprintf(si, " %d) %s\n", V->nseq, s);
204 
205  saveadd = V->addit;
206  V->addit = true;
207  V->isseqchar = isAnyChar;
208  addseq( si, V);
209  V->addit = saveadd;
210  V->isseqchar = isSeqChar;
211 }
212 
213 
214 
215 
216 Local void readLoop(short margin, boolean addfirst,
217  boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V),
218  struct ReadSeqVars *V)
219 {
220  boolean addend = false;
221  boolean ungetend = false;
222 
223  V->nseq++;
224  if (V->choice == kListSequences) V->addit = false;
225  else V->addit = (V->nseq == V->choice);
226  if (V->addit) V->seqlen = 0;
227 
228  if (addfirst) addseq(V->s, V);
229  do {
230  GetLine(V);
231  V->done = feof(V->f);
232  V->done |= (*endTest)( &addend, &ungetend, V);
233  if (V->addit && (addend || !V->done) && (strlen(V->s) > (unsigned)margin)) {
234  addseq( (V->s)+margin, V);
235  }
236  } while (!V->done);
237 
238  if (V->choice == kListSequences) addinfo(V->seqid, V);
239  else {
240  V->allDone = (V->nseq >= V->choice);
241  if (V->allDone && ungetend) unGetLine(V);
242  }
243 }
244 
245 
246 
247 Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
248 {
249  *addend = true; /* 1 or 2 occur in line w/ bases */
250  *ungetend= false;
251  return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL));
252 }
253 
254 Local void readIG(struct ReadSeqVars *V)
255 {
256 /* 18Aug92: new IG format -- ^L between sequences in place of ";" */
257  char *si;
258 
259  while (!V->allDone) {
260  do {
261  GetLine(V);
262  for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */
263  if (*si == 0) *V->s= 0; /* chop line to empty */
264  } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) ));
265  if (feof(V->f))
266  V->allDone = true;
267  else {
268  strcpy(V->seqid, V->s);
269  readLoop(0, false, endIG, V);
270  }
271  }
272 }
273 
274 
275 
276 Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
277 {
278  *addend = false;
279  *ungetend= false;
280  return (strstr( V->s, "//") != NULL);
281 }
282 
284 { /* ? only 1 seq/file ? */
285 
286  while (!V->allDone) {
287  GetLine(V);
288  if (strstr(V->s,"; DNA sequence ") == V->s)
289  strcpy(V->seqid, (V->s)+16);
290  else
291  strcpy(V->seqid, (V->s)+1);
292  while ((!feof(V->f)) && (*V->s == ';')) {
293  GetLine(V);
294  }
295  if (feof(V->f)) V->allDone = true;
296  else readLoop(0, true, endStrider, V);
297  }
298 }
299 
300 
301 Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
302 {
303  *addend = false;
304  *ungetend= (strstr(V->s,"ENTRY") == V->s);
305  return ((strstr(V->s,"///") != NULL) || *ungetend);
306 }
307 
308 Local void readPIR(struct ReadSeqVars *V)
309 { /*PIR -- many seqs/file */
310 
311  while (!V->allDone) {
312  while (! (feof(V->f) || strstr(V->s,"ENTRY") || strstr(V->s,"SEQUENCE")) )
313  GetLine(V);
314  strcpy(V->seqid, (V->s)+16);
315  while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s))
316  GetLine(V);
317  readLoop(0, false, endPIR, V);
318 
319  if (!V->allDone) {
320  while (! (feof(V->f) || ((*V->s != 0)
321  && (strstr( V->s,"ENTRY") == V->s))))
322  GetLine(V);
323  }
324  if (feof(V->f)) V->allDone = true;
325  }
326 }
327 
328 
329 Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
330 {
331  *addend = false;
332  *ungetend= (strstr(V->s,"LOCUS") == V->s);
333  return ((strstr(V->s,"//") != NULL) || *ungetend);
334 }
335 
336 Local void readGenBank(struct ReadSeqVars * V)
337 { /* GenBank -- many seqs/file */
338 
339  V->isseqchar = isSeqNumChar;
341 
342  while (!V->allDone) {
343  strcpy(V->seqid, (V->s)+12);
344  while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s))
345  GetLine(V);
346  readLoop(0, false, endGB, V);
347 
348  if (!V->allDone) {
349  while (! (feof(V->f) || ((*V->s != 0)
350  && (strstr( V->s, "LOCUS") == V->s))))
351  GetLine(V);
352  }
353  if (feof(V->f)) V->allDone = true;
354  }
355 
356  V->isseqchar = isSeqChar;
357  V->isseqcharfirst8 = 0;
358 }
359 
360 
361 Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
362 {
363  char *a;
364 
365  if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */
366  /* "*" can be valid base symbol, drop it here */
367  *a = 0;
368  *addend = true;
369  *ungetend= false;
370  return(true);
371  }
372  else if (*V->s == '>') { /* start of next seq */
373  *addend = false;
374  *ungetend= true;
375  return(true);
376  }
377  else
378  return(false);
379 }
380 
381 Local void readNBRF(struct ReadSeqVars *V)
382 {
383  while (!V->allDone) {
384  strcpy(V->seqid, (V->s)+4);
385  GetLine(V); /*skip title-junk line*/
386  readLoop(0, false, endNBRF, V);
387  if (!V->allDone) {
388  while (!(feof(V->f) || (*V->s != 0 && *V->s == '>')))
389  GetLine(V);
390  }
391  if (feof(V->f)) V->allDone = true;
392  }
393 }
394 
395 
396 
397 Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
398 {
399  *addend = false;
400  *ungetend= true;
401  return(*V->s == '>');
402 }
403 
405 {
406  while (!V->allDone) {
407  strcpy(V->seqid, (V->s)+1);
408  readLoop(0, false, endPearson, V);
409  if (!V->allDone) {
410  while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>'))))
411  GetLine(V);
412  }
413  if (feof(V->f)) V->allDone = true;
414  }
415 }
416 
417 
418 
419 Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
420 {
421  *addend = false;
422  *ungetend= (strstr(V->s,"ID ") == V->s);
423  return ((strstr(V->s,"//") != NULL) || *ungetend);
424 }
425 
426 Local void readEMBL(struct ReadSeqVars *V)
427 {
428  while (!V->allDone) {
429  strcpy(V->seqid, (V->s)+5);
430  do {
431  GetLine(V);
432  } while (!(feof(V->f) | (strstr(V->s,"SQ ") == V->s)));
433 
434  readLoop(0, false, endEMBL, V);
435  if (!V->allDone) {
436  while (!(feof(V->f) |
437  ((*V->s != '\0') & (strstr(V->s,"ID ") == V->s))))
438  GetLine(V);
439  }
440  if (feof(V->f)) V->allDone = true;
441  }
442 }
443 
444 
445 
446 Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
447 {
448  *addend = false;
449  *ungetend= true;
450  return( *V->s == '(' );
451 }
452 
453 Local void readZuker(struct ReadSeqVars *V)
454 {
457  while (!V->allDone) {
458  GetLine(V); /*s == "seqLen seqid string..."*/
459  strcpy(V->seqid, (V->s)+6);
460  readLoop(0, false, endZuker, V);
461  if (!V->allDone) {
462  while (!(feof(V->f) |
463  ((*V->s != '\0') & (*V->s == '('))))
464  GetLine(V);
465  }
466  if (feof(V->f)) V->allDone = true;
467  }
468 }
469 
470 
471 
472 Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
473 {
474  /* this is a somewhat shaky end,
475  1st char of line is non-blank for seq. title
476  */
477  *addend = false;
478  *ungetend= true;
479  return( *V->s != ' ' );
480 }
481 
482 Local void readFitch(struct ReadSeqVars *V)
483 {
484  boolean first;
485 
486  first = true;
487  while (!V->allDone) {
488  if (!first) strcpy(V->seqid, V->s);
489  readLoop(0, first, endFitch, V);
490  if (feof(V->f)) V->allDone = true;
491  first = false;
492  }
493 }
494 
495 
496 Local void readPlain(struct ReadSeqVars *V)
497 {
498  V->nseq++;
499  V->addit = (V->choice > 0);
500  if (V->addit) V->seqlen = 0;
501  addseq(V->seqid, V); /*from above..*/
502  if (V->fname!=NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname);
503  else sprintf(V->seqid, " [Unknown form]");
504  do {
505  addseq(V->s, V);
506  V->done = feof(V->f);
507  GetLine(V);
508  } while (!V->done);
509  if (V->choice == kListSequences) addinfo(V->seqid, V);
510  V->allDone = true;
511 }
512 
513 
514 Local void readUWGCG(struct ReadSeqVars *V)
515 {
516 /*
517 10nov91: Reading GCG files casued duplication of last line when
518  EOF followed that line !!!
519  fix: GetLine now sets *V->s = 0
520 */
521  char *si;
522 
523  V->nseq++;
524  V->addit = (V->choice > 0);
525  if (V->addit) V->seqlen = 0;
526  strcpy(V->seqid, V->s);
527  /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
528  /*drop above or ".." from id*/
529  if ((si = strstr(V->seqid," Length: "))) *si = 0;
530  else if ((si = strstr(V->seqid,".."))) *si = 0;
531  do {
532  V->done = feof(V->f);
533  GetLine(V);
534  if (!V->done) addseq((V->s), V);
535  } while (!V->done);
536  if (V->choice == kListSequences) addinfo(V->seqid, V);
537  V->allDone = true;
538 }
539 
540 
541 Local void readOlsen(struct ReadSeqVars *V)
542 { /* G. Olsen /print output from multiple sequence editor */
543 
544  char *si, *sj, *sk, *sm, sid[40], snum[20];
545  boolean indata = false;
546  int snumlen;
547 
548  V->addit = (V->choice > 0);
549  if (V->addit) V->seqlen = 0;
550  rewind(V->f); V->nseq= 0;
551  do {
552  GetLine(V);
553  V->done = feof(V->f);
554 
555  if (V->done && !(*V->s)) break;
556  else if (indata) {
557  if ( (si= strstr(V->s, sid))
558  /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */
559  && (sm= strstr(V->s, snum)) && (sm < si - snumlen) ) {
560 
561  /* Spaces are valid alignment data !! */
562 /* 17Oct91: Error, the left margin is 21 not 22! */
563 /* dropped some nucs up to now -- my example file was right shifted ! */
564 /* variable right id margin, drop id-2 spaces at end */
565 /*
566  VMS CC COMPILER (VAXC031) mess up:
567  -- Index of 21 is chopping 1st nuc on VMS systems Only!
568  Byte-for-byte same ame rnasep.olsen sequence file !
569 */
570 
571  /* si = (V->s)+21; < was this before VMS CC wasted my time */
572  si += 10; /* use strstr index plus offset to outfox VMS CC bug */
573 
574  if ((sk = strstr(si, sid))) *(sk-2) = 0;
575  for (sk = si; *sk != 0; sk++) {
576  if (*sk == ' ') *sk = '.';
577  /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */
578  else if (isdigit(*sk)) *sk= nonummask[*sk - '0'];
579  }
580 
581  addseq(si, V);
582  }
583  }
584 
585  else if ((sk = strstr(V->s, "): "))) { /* seq info header line */
586  /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */
587  /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 16:12 */
588  /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */
589  (V->nseq)++;
590  si = 1 + strchr(V->s,'(');
591  *sk = ' ';
592  if (V->choice == kListSequences) addinfo( si, V);
593  else if (V->nseq == V->choice) {
594  strcpy(V->seqid, si);
595  sj = strchr(V->seqid, ':');
596  while (*(--sj) == ' ') ;
597  while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; }
598 
599  *sk = 0;
600  while (*(--sk) == ' ') *sk = 0;
601  strcpy(sid, si);
602 
603  si= V->s;
604  while ((*si <= ' ') && (*si != 0)) si++;
605  snumlen=0;
606  while (si[snumlen] > ' ' && snumlen<20)
607  { snum[snumlen]= si[snumlen]; snumlen++; }
608  snum[snumlen]= 0;
609  }
610 
611  }
612 
613  else if (strstr(V->s,"identity: Data:")) {
614  indata = true;
615  if (V->choice == kListSequences) V->done = true;
616  }
617 
618  } while (!V->done);
619 
620  V->allDone = true;
621 } /*readOlsen*/
622 
623 
624 Local void readMSF(struct ReadSeqVars *V)
625 { /* gcg's MSF, mult. sequence format, interleaved ! */
626 
627  char *si, *sj, sid[128];
628  boolean indata = false;
629  int iline= 0;
630 
631  V->addit = (V->choice > 0);
632  if (V->addit) V->seqlen = 0;
633  rewind(V->f); V->nseq= 0;
634  do {
635  GetLine(V);
636  V->done = feof(V->f);
637 
638  if (V->done && !(*V->s)) break;
639  else if (indata) {
640  /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */
641  /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */
642 
643  si= V->s;
644  skipwhitespace(si);
645  /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/
646  for (sj= si; *sj > ' '; sj++) ;
647  *sj= 0;
648  if ( *si ) {
649  if ( (0==strcmp(si, sid)) ) {
650  addseq(sj+1, V);
651  }
652  iline++;
653  }
654  }
655 
656  else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */
657  /* Name: somename Len: 100 Check: 7009 Weight: 1.00 */
658 
659  (V->nseq)++;
660  si += 6;
661  if (V->choice == kListSequences) addinfo( si, V);
662  else if (V->nseq == V->choice) {
663  strcpy(V->seqid, si);
664  si = V->seqid;
665  skipwhitespace(si);
666  /* for (sj= si; isalnum(*sj); sj++) ; -- bug */
667  for (sj= si; *sj > ' '; sj++) ;
668  *sj= 0;
669  strcpy(sid, si);
670  }
671  }
672 
673  else if ( strstr(V->s,"//") /*== V->s*/ ) {
674  indata = true;
675  iline= 0;
676  if (V->choice == kListSequences) V->done = true;
677  }
678 
679  } while (!V->done);
680 
681 
682  V->allDone = true;
683 } /*readMSF*/
684 
685 
686 
688 { /* PAUP mult. sequence format, interleaved or sequential! */
689 
690  char *si, *sj, *send, sid[40], sid1[40], saveseq[255];
691  boolean first = true, indata = false, domatch;
692  int iline= 0, ifmc, saveseqlen=0;
693 
694 #define fixmatchchar(s) { \
695  for (ifmc=0; ifmc<saveseqlen; ifmc++) \
696  if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; }
697 
698  V->addit = (V->choice > 0);
699  V->seqlencount = 0;
700  if (V->addit) V->seqlen = 0;
701  /* rewind(V->f); V->nseq= 0; << do in caller !*/
702  indata= true; /* call here after we find "matrix" */
703  domatch= (V->matchchar > 0);
704 
705  do {
706  GetLine(V);
707  V->done = feof(V->f);
708 
709  if (V->done && !(*V->s)) break;
710  else if (indata) {
711  /* [ 1 1 1 ]*/
712  /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
713  /* chimp ................a.t. .c.................a ..........*/
714  /* !! need to correct for V->matchchar */
715  si= V->s;
716  skipwhitespace(si);
717  if (strchr(si,';')) indata= false;
718 
719  if (isalnum(*si)) {
720  /* valid data line starts w/ a left-justified seq name in columns [0..8] */
721  if (first) {
722  (V->nseq)++;
723  if (V->nseq >= V->topnseq) first= false;
724  for (sj = si; isalnum(*sj); sj++) ;
725  send= sj;
726  skipwhitespace(sj);
727  if (V->choice == kListSequences) {
728  *send= 0;
729  addinfo( si, V);
730  }
731  else if (V->nseq == V->choice) {
732  if (domatch) {
733  if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); }
734  else fixmatchchar( sj);
735  }
736  addseq(sj, V);
737  *send= 0;
738  strcpy(V->seqid, si);
739  strcpy(sid, si);
740  if (V->nseq == 1) strcpy(sid1, sid);
741  }
742  }
743 
744  else if ( (strstr(si, sid) == si) ){
745  while (isalnum(*si)) si++;
746  skipwhitespace(si);
747  if (domatch) {
748  if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); }
749  else fixmatchchar( si);
750  }
751  addseq(si, V);
752  }
753 
754  else if (domatch && (strstr(si, sid1) == si)) {
755  strcpy( saveseq, si);
756  saveseqlen= strlen(saveseq);
757  }
758 
759  iline++;
760  }
761  }
762 
763  else if ( strstr(V->s,"matrix") ) {
764  indata = true;
765  iline= 0;
766  if (V->choice == kListSequences) V->done = true;
767  }
768 
769  } while (!V->done);
770 
771  V->allDone = true;
772 } /*readPAUPinterleaved*/
773 
774 
775 
777 { /* PAUP mult. sequence format, interleaved or sequential! */
778  char *si, *sj;
779  boolean atname = true, indata = false;
780 
781  V->addit = (V->choice > 0);
782  if (V->addit) V->seqlen = 0;
783  V->seqlencount = 0;
784  /* rewind(V->f); V->nseq= 0; << do in caller !*/
785  indata= true; /* call here after we find "matrix" */
786  do {
787  GetLine(V);
788  V->done = feof(V->f);
789 
790  if (V->done && !(*V->s)) break;
791  else if (indata) {
792  /* [ 1 1 1 ]*/
793  /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
794  /* aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/
795  /* chimp ................a.t. .c.................a ..........*/
796  /* ................a.t. .c.................a ..........*/
797 
798  si= V->s;
799  skipwhitespace(si);
800  if (strchr(si,';')) indata= false;
801  if (isalnum(*si)) {
802  /* valid data line starts w/ a left-justified seq name in columns [0..8] */
803  if (atname) {
804  (V->nseq)++;
805  V->seqlencount = 0;
806  atname= false;
807  sj= si+1;
808  while (isalnum(*sj)) sj++;
809  if (V->choice == kListSequences) {
810  /* !! we must count bases to know when topseqlen is reached ! */
811  countseq(sj, V);
812  if (V->seqlencount >= V->topseqlen) atname= true;
813  *sj= 0;
814  addinfo( si, V);
815  }
816  else if (V->nseq == V->choice) {
817  addseq(sj, V);
818  V->seqlencount= V->seqlen;
819  if (V->seqlencount >= V->topseqlen) atname= true;
820  *sj= 0;
821  strcpy(V->seqid, si);
822  }
823  else {
824  countseq(sj, V);
825  if (V->seqlencount >= V->topseqlen) atname= true;
826  }
827  }
828 
829  else if (V->nseq == V->choice) {
830  addseq(V->s, V);
831  V->seqlencount= V->seqlen;
832  if (V->seqlencount >= V->topseqlen) atname= true;
833  }
834  else {
835  countseq(V->s, V);
836  if (V->seqlencount >= V->topseqlen) atname= true;
837  }
838  }
839  }
840 
841  else if ( strstr(V->s,"matrix") ) {
842  indata = true;
843  atname= true;
844  if (V->choice == kListSequences) V->done = true;
845  }
846 
847  } while (!V->done);
848 
849  V->allDone = true;
850 } /*readPAUPsequential*/
851 
852 
854 {
855  char *si, *sj;
856  boolean first = true;
857  int iline= 0;
858 
859  V->addit = (V->choice > 0);
860  if (V->addit) V->seqlen = 0;
861  V->seqlencount = 0;
862  /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */
863  si= V->s;
864  skipwhitespace(si);
865  V->topnseq= atoi(si);
866  while (isdigit(*si)) si++;
867  skipwhitespace(si);
868  V->topseqlen= atol(si);
869  /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, V->topseqlen); */
870 
871  do {
872  GetLine(V);
873  V->done = feof(V->f);
874 
875  if (V->done && !(*V->s)) break;
876  si= V->s;
877  skipwhitespace(si);
878  if (*si != 0) {
879 
880  if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */
881  (V->nseq)++;
882  if (V->nseq >= V->topnseq) first= false;
883  sj= V->s+10; /* past name, start of data */
884  if (V->choice == kListSequences) {
885  *sj= 0;
886  addinfo( si, V);
887  }
888  else if (V->nseq == V->choice) {
889  addseq(sj, V);
890  *sj= 0;
891  strcpy(V->seqid, si);
892  }
893  }
894  else if ( iline % V->nseq == V->choice -1 ) {
895  addseq(si, V);
896  }
897  iline++;
898  }
899  } while (!V->done);
900 
901  V->allDone = true;
902 } /*readPhylipInterleaved*/
903 
904 
905 
906 Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
907 {
908  *addend = false;
909  *ungetend= false;
910  countseq( V->s, V);
911  return V->seqlencount >= V->topseqlen;
912 }
913 
915 {
916  short i;
917  char *si;
918  /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */
919  si= V->s;
920  skipwhitespace(si);
921  V->topnseq= atoi(si);
922  while (isdigit(*si)) si++;
923  skipwhitespace(si);
924  V->topseqlen= atol(si);
925  GetLine(V);
926  while (!V->allDone) {
927  V->seqlencount= 0;
928  strncpy(V->seqid, (V->s), 10);
929  V->seqid[10]= 0;
930  for (i=0; i<10 && V->s[i]; i++) V->s[i]= ' ';
931  readLoop(0, true, endPhylipSequential, V);
932  if (feof(V->f)) V->allDone = true;
933  }
934 }
935 
936 
937 
938 
940  struct ReadSeqVars *V,
941  const long skiplines_,
942  const short format_)
943 {
944 #define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \
945  for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); }
946 
947  boolean gotuw;
948  long l;
949 
950  V->linestart= 0;
951  V->matchchar= 0;
952  if (V->f == NULL)
953  V->err = eFileNotFound;
954  else {
955 
956  for (l = skiplines_; l > 0; l--) GetLine( V);
957 
958  do {
959  GetLine( V);
960  for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ;
961  } while ((l == 0) && !feof(V->f));
962 
963  if (feof(V->f)) V->err = eNoData;
964 
965  else switch (format_) {
966  case kPlain : readPlain(V); break;
967  case kIG : readIG(V); break;
968  case kStrider: readStrider(V); break;
969  case kGenBank: readGenBank(V); break;
970  case kPIR : readPIR(V); break;
971  case kNBRF : readNBRF(V); break;
972  case kPearson: readPearson(V); break;
973  case kEMBL : readEMBL(V); break;
974  case kZuker : readZuker(V); break;
975  case kOlsen : readOlsen(V); break;
976  case kMSF : readMSF(V); break;
977 
978  case kPAUP : {
979  boolean done= false;
980  boolean interleaved= false;
981  char *cp;
982  /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */
983  while (!done) {
984  GetLine( V);
985  tolowerstr( V->s);
986  if (strstr( V->s, "matrix")) done= true;
987  if (strstr( V->s, "interleav")) interleaved= true;
988  if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5);
989  if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6);
990  if (NULL != (cp=strstr( V->s, "matchchar=")) ) {
991  cp += 10;
992  if (*cp=='\'') cp++;
993  else if (*cp=='"') cp++;
994  V->matchchar= *cp;
995  }
996  }
997  if (interleaved) readPAUPinterleaved(V);
998  else readPAUPsequential(V);
999  }
1000  break;
1001 
1002  /* kPhylip: ! can't determine in middle of file which type it is...*/
1003  /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */
1004  case kPhylip2:
1006  break;
1007  case kPhylip4: /* == kPhylip3 */
1009  break;
1010 
1011  default:
1012  V->err = eUnknownFormat;
1013  break;
1014 
1015  case kFitch :
1016  strcpy(V->seqid, V->s); GetLine(V);
1017  readFitch(V);
1018  break;
1019 
1020  case kGCG:
1021  do {
1022  gotuw = (strstr(V->s,"..") != NULL);
1023  if (gotuw) readUWGCG(V);
1024  GetLine(V);
1025  } while (!(feof(V->f) || V->allDone));
1026  break;
1027  }
1028  }
1029 
1030  V->filestart= false;
1031  V->seq[V->seqlen] = 0; /* stick a string terminator on it */
1032 }
1033 
1034 
1036  const short whichEntry_, /* index to sequence in file */
1037  FILE *fp_, /* pointer to open seq file */
1038  const long skiplines_,
1039  const short format_, /* sequence file format */
1040  long *seqlen_, /* return seq size */
1041  short *nseq_, /* number of seqs in file, for listSeqs() */
1042  short *error_, /* return error */
1043  char *seqid_) /* return seq name/info */
1044 {
1045  struct ReadSeqVars V;
1046 
1047  if (format_ < kMinFormat || format_ > kMaxFormat) {
1048  *error_ = eUnknownFormat;
1049  *seqlen_ = 0;
1050  return NULL;
1051  }
1052 
1053  V.choice = whichEntry_;
1054  V.fname = NULL; /* don't know */
1055  V.seq = (char*) calloc(1, kStartLength+1);
1056  V.maxseq = kStartLength;
1057  V.seqlen = 0;
1058  V.seqid = seqid_;
1059 
1060  V.f = fp_;
1061  V.filestart= (ftell( fp_) == 0);
1062  /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */
1063  if (V.filestart) V.nseq = 0;
1064  else V.nseq= *nseq_; /* track where we are in file...*/
1065 
1066  *V.seqid = '\0';
1067  V.err = 0;
1068  V.nseq = 0;
1069  V.isseqchar = isSeqChar;
1070  V.isseqcharfirst8 = 0;
1071  if (V.choice == kListSequences) ; /* leave as is */
1072  else if (V.choice <= 0) V.choice = 1; /* default ?? */
1073  V.addit = (V.choice > 0);
1074  V.allDone = false;
1075 
1076  readSeqMain(&V, skiplines_, format_);
1077 
1078  *error_ = V.err;
1079  *seqlen_ = V.seqlen;
1080  *nseq_ = V.nseq;
1081  return V.seq;
1082 }
1083 
1084 char *readSeq(
1085  const short whichEntry_, /* index to sequence in file */
1086  const char *filename_, /* file name */
1087  const long skiplines_,
1088  const short format_, /* sequence file format */
1089  long *seqlen_, /* return seq size */
1090  short *nseq_, /* number of seqs in file, for listSeqs() */
1091  short *error_, /* return error */
1092  char *seqid_) /* return seq name/info */
1093 {
1094  struct ReadSeqVars V;
1095 
1096  if (format_ < kMinFormat || format_ > kMaxFormat) {
1097  *error_ = eUnknownFormat;
1098  *seqlen_ = 0;
1099  return NULL;
1100  }
1101 
1102  V.choice = whichEntry_;
1103  V.fname = filename_; /* don't need to copy string, just ptr to it */
1104  V.seq = (char*) calloc(1, kStartLength+1);
1105  V.maxseq = kStartLength;
1106  V.seqlen = 0;
1107  V.seqid = seqid_;
1108 
1109  V.f = NULL;
1110  *V.seqid = '\0';
1111  V.err = 0;
1112  V.nseq = 0;
1113  V.isseqchar = isSeqChar;
1114  V.isseqcharfirst8 = 0;
1115  if (V.choice == kListSequences) ; /* leave as is */
1116  else if (V.choice <= 0) V.choice = 1; /* default ?? */
1117  V.addit = (V.choice > 0);
1118  V.allDone = false;
1119 
1120  V.f = fopen(V.fname, "r");
1121  V.filestart= true;
1122 
1123  readSeqMain(&V, skiplines_, format_);
1124 
1125  if (V.f != NULL) fclose(V.f);
1126  *error_ = V.err;
1127  *seqlen_ = V.seqlen;
1128  *nseq_ = V.nseq;
1129  return V.seq;
1130 }
1131 
1132 
1133 
1134 
1135 
1136 char *listSeqs(
1137  const char *filename_, /* file name */
1138  const long skiplines_,
1139  const short format_, /* sequence file format */
1140  short *nseq_, /* number of seqs in file, for listSeqs() */
1141  short *error_) /* return error */
1142 {
1143  char seqid[256];
1144  long seqlen;
1145 
1146  return readSeq( kListSequences, filename_, skiplines_, format_,
1147  &seqlen, nseq_, error_, seqid);
1148 }
1149 
1150 
1151 
1152 
1153 short seqFileFormat( /* return sequence format number, see ureadseq.h */
1154  const char *filename,
1155  long *skiplines, /* return #lines to skip any junk like mail header */
1156  short *error) /* return any error value or 0 */
1157 {
1158  FILE *fseq;
1159  short format;
1160 
1161  fseq = fopen(filename, "r");
1162  format= seqFileFormatFp( fseq, skiplines, error);
1163  if (fseq!=NULL) fclose(fseq);
1164  return format;
1165 }
1166 
1168  FILE *fseq,
1169  long *skiplines, /* return #lines to skip any junk like mail header */
1170  short *error) /* return any error value or 0 */
1171 {
1172  boolean foundIG= false, foundStrider= false,
1173  foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false,
1174  foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false,
1175  gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false,
1176  isfitch= false, isphylip= false, done= false;
1177  short format= kUnknown;
1178  int nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0;
1179  char sp[256];
1180  long linestart=0;
1181  int maxlines2check=500;
1182 
1183 #define ReadOneLine(sp) \
1184  { done |= (feof(fseq)); \
1185  readline( fseq, sp, &linestart); \
1186  if (!done) { splen = strlen(sp); ++nlines; } }
1187 
1188  *skiplines = 0;
1189  *error = 0;
1190  if (fseq == NULL) { *error = eFileNotFound; return kNoformat; }
1191 
1192  while ( !done ) {
1193  ReadOneLine(sp);
1194 
1195  /* check for mailer head & skip past if found */
1196  if (nlines < 4 && !done) {
1197  if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) {
1198  do {
1199  /* skip all lines until find one blank line */
1200  ReadOneLine(sp);
1201  if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ;
1202  } while ((!done) && (k < splen));
1203  *skiplines = nlines; /* !? do we want #lines or #bytes ?? */
1204  }
1205  }
1206 
1207  if (sp==NULL || *sp==0)
1208  ; /* nada */
1209 
1210  /* high probability identities: */
1211 
1212  else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") )
1213  gotMSF= true;
1214 
1215  else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL))
1216  gotuw= true;
1217 
1218  else if (strstr(sp,"identity: Data:") != NULL)
1219  gotolsen= true;
1220 
1221  else if ( strstr(sp,"::=") &&
1222  (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */
1223  strstr(sp,"Seq-entry") ||
1224  strstr(sp,"Seq-submit") ) ) /* can we read submit format? */
1225  gotasn1= true;
1226 
1227  else if ( strstr(sp,"#NEXUS") == sp )
1228  gotpaup= true;
1229 
1230  /* uncertain identities: */
1231 
1232  else if (*sp ==';') {
1233  if (strstr(sp,"Strider") !=NULL) foundStrider= true;
1234  else foundIG= true;
1235  }
1236 
1237  else if (strstr(sp,"LOCUS") == sp)
1238  foundGB= true;
1239  else if (strstr(sp,"ORIGIN") == sp)
1240  foundGB= true;
1241 
1242  else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */
1243  foundPIR= true;
1244  else if (strstr(sp,"SEQUENCE") == sp)
1245  foundPIR= true;
1246 
1247  else if (*sp == '>') {
1248  if (sp[3] == ';') foundNBRF= true;
1249  else foundPearson= true;
1250  }
1251 
1252  else if (strstr(sp,"ID ") == sp)
1253  foundEMBL= true;
1254  else if (strstr(sp,"SQ ") == sp)
1255  foundEMBL= true;
1256 
1257  else if (*sp == '(')
1258  foundZuker= true;
1259 
1260  else {
1261  if (nlines - *skiplines == 1) {
1262  int ispp= 0, ilen= 0;
1263  sscanf( sp, "%d%d", &ispp, &ilen);
1264  if (ispp > 0 && ilen > 0) isphylip= true;
1265  }
1266  else if (isphylip && nlines - *skiplines == 2) {
1267  int tseq;
1268  tseq= getseqtype(sp+10, strlen(sp+10));
1269  if ( isalpha(*sp) /* 1st letter in 2nd line must be of a name */
1270  && (tseq != kOtherSeq)) /* sequence section must be okay */
1271  foundPhylip= true;
1272  }
1273 
1274  for (k=0, isfitch= true; isfitch & (k < splen); k++) {
1275  if (k % 4 == 0) isfitch &= (sp[k] == ' ');
1276  else isfitch &= (sp[k] != ' ');
1277  }
1278  if (isfitch & (splen > 20)) foundFitch= true;
1279 
1280  /* kRNA && kDNA are fairly certain...*/
1281  switch (getseqtype( sp, splen)) {
1282  case kOtherSeq: otherlines++; break;
1283  case kAmino : if (splen>20) aminolines++; break;
1284  case kDNA :
1285  case kRNA : if (splen>20) dnalines++; break;
1286  case kNucleic : break; /* not much info ? */
1287  }
1288 
1289  }
1290 
1291  /* pretty certain */
1292  if (gotolsen) {
1293  format= kOlsen;
1294  done= true;
1295  }
1296  else if (gotMSF) {
1297  format= kMSF;
1298  done= true;
1299  }
1300  else if (gotasn1) {
1301  /* !! we need to look further and return kASNseqentry | kASNseqset */
1302  /*
1303  seqentry key is Seq-entry ::=
1304  seqset key is Bioseq-set ::=
1305  ?? can't read these yet w/ ncbi tools ??
1306  Seq-submit ::=
1307  Bioseq ::= << fails both bioseq-seq and seq-entry parsers !
1308  */
1309  if (strstr(sp,"Bioseq-set")) format= kASNseqset;
1310  else if (strstr(sp,"Seq-entry")) format= kASNseqentry;
1311  else format= kASN1; /* other form, we can't yet read... */
1312  done= true;
1313  }
1314  else if (gotpaup) {
1315  format= kPAUP;
1316  done= true;
1317  }
1318 
1319  else if (gotuw) {
1320  if (foundIG) format= kIG; /* a TOIG file from GCG for certain */
1321  else format= kGCG;
1322  done= true;
1323  }
1324 
1325  else if ((dnalines > 1) || done || (nlines > maxlines2check)) {
1326  /* decide on most likely format */
1327  /* multichar idents: */
1328  if (foundStrider) format= kStrider;
1329  else if (foundGB) format= kGenBank;
1330  else if (foundPIR) format= kPIR;
1331  else if (foundEMBL) format= kEMBL;
1332  else if (foundNBRF) format= kNBRF;
1333  /* single char idents: */
1334  else if (foundIG) format= kIG;
1335  else if (foundPearson) format= kPearson;
1336  else if (foundZuker) format= kZuker;
1337  /* digit ident: */
1338  else if (foundPhylip) format= kPhylip;
1339  /* spacing ident: */
1340  else if (foundFitch) format= kFitch;
1341  /* no format chars: */
1342  else if (otherlines > 0) format= kUnknown;
1343  else if (dnalines > 1) format= kPlain;
1344  else if (aminolines > 1) format= kPlain;
1345  else format= kUnknown;
1346 
1347  done= true;
1348  }
1349 
1350  /* need this for possible long header in olsen format */
1351  else if (strstr(sp,"): ") != NULL)
1352  maxlines2check++;
1353  }
1354 
1355  if (format == kPhylip) {
1356  /* check for interleaved or sequential -- really messy */
1357  int tname, tseq;
1358  long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0;
1359  char *ps;
1360 
1361  rewind(fseq);
1362  for (i=0; i < *skiplines; i++) ReadOneLine(sp);
1363  nlines= 0;
1364  ReadOneLine(sp);
1365  sscanf( sp, "%ld%ld", &nspp, &nlen);
1366  ReadOneLine(sp); /* 1st seq line */
1367  for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1368 
1369  for (i= 1; i<nspp; i++) {
1370  ReadOneLine(sp);
1371 
1372  tseq= getseqtype(sp+10, strlen(sp+10));
1373  tname= getseqtype(sp, 10);
1374  for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
1375  for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1376 
1377  /* find probable interleaf or sequential ... */
1378  if (j>=9) seq += 10; /* pretty certain not ileaf */
1379  else {
1380  if (tseq != tname) leaf++; else seq++;
1381  if (tname == kDNA || tname == kRNA) seq++; else leaf++;
1382  }
1383 
1384  if (ilen <= nlen && j<9) {
1385  if (tname == kOtherSeq) leaf += 10;
1386  else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++;
1387  }
1388  else if (ilen > nlen) {
1389  ilen= 0;
1390  }
1391  }
1392  for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */
1393  ReadOneLine(sp);
1394 
1395  tseq= getseqtype(sp+10, strlen(sp+10));
1396  tname= getseqtype(sp, 10);
1397  for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++;
1398  for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++);
1399  if (j<9) {
1400  if (tname == kOtherSeq) seq += 10;
1401  if (tseq != tname) seq++; else leaf++;
1402  if (tname == kDNA || tname == kRNA) leaf++; else seq++;
1403  }
1404  if (ilen > nlen) {
1405  if (j>9) leaf += 10; /* must be a name here for sequent */
1406  else if (tname == kOtherSeq) seq += 10;
1407  ilen= 0;
1408  }
1409  }
1410 
1411  if (leaf > seq) format= kPhylip4;
1412  else format= kPhylip2;
1413  }
1414 
1415  return(format);
1416 #undef ReadOneLine
1417 } /* SeqFileFormat */
1418 
1419 
1420 
1421 
1422 unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal)
1423 /* GCGchecksum */
1424 {
1425  long i, check = 0, count = 0;
1426 
1427  for (i = 0; i < seqlen; i++) {
1428  count++;
1429  check += count * to_upper(seq[i]);
1430  if (count == 57) count = 0;
1431  }
1432  check %= 10000;
1433  *checktotal += check;
1434  *checktotal %= 10000;
1435  return check;
1436 }
1437 
1438 /* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */
1439 const unsigned long crctab[] = {
1440  0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
1441  0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
1442  0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
1443  0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
1444  0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
1445  0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
1446  0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
1447  0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
1448  0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
1449  0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
1450  0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
1451  0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
1452  0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
1453  0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
1454  0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
1455  0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
1456  0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
1457  0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
1458  0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
1459  0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
1460  0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
1461  0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
1462  0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
1463  0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
1464  0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
1465  0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
1466  0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
1467  0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
1468  0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
1469  0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
1470  0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
1471  0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
1472  0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
1473  0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
1474  0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
1475  0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
1476  0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
1477  0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
1478  0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
1479  0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
1480  0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
1481  0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
1482  0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
1483  0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
1484  0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
1485  0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
1486  0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
1487  0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
1488  0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
1489  0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
1490  0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
1491  0x2d02ef8dL
1492 };
1493 
1494 unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)
1495 /*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */
1496 {
1497  unsigned long c = 0xffffffffL;
1498  long n = seqlen;
1499 
1500  if (!seq) return 0;
1501  while (n--) {
1502  c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8);
1503  seq++; /* fixed aug'98 finally */
1504  }
1505  c= c ^ 0xffffffffL;
1506  *checktotal += c;
1507  return c;
1508 }
1509 
1510 
1511 
1512 
1513 short getseqtype( const char *seq, const long seqlen)
1514 { /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */
1515  char c;
1516  short i, maxtest;
1517  short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0;
1518 
1519  maxtest = min(300, seqlen);
1520  for (i = 0; i < maxtest; i++) {
1521  c = to_upper(seq[i]);
1522  if (strchr(protonly, c)) po++;
1523  else if (strchr(primenuc,c)) {
1524  na++;
1525  if (c == 'T') nt++;
1526  else if (c == 'U') nu++;
1527  }
1528  else if (strchr(aminos,c)) aa++;
1529  else if (strchr(seqsymbols,c)) ns++;
1530  else if (isalpha(c)) no++;
1531  }
1532 
1533  if ((no > 0) || (po+aa+na == 0)) return kOtherSeq;
1534  /* ?? test for probability of kOtherSeq ?, e.g.,
1535  else if (po+aa+na / maxtest < 0.70) return kOtherSeq;
1536  */
1537  else if (po > 0) return kAmino;
1538  else if (aa == 0) {
1539  if (nu > nt) return kRNA;
1540  else return kDNA;
1541  }
1542  else if (na > aa) return kNucleic;
1543  else return kAmino;
1544 } /* getseqtype */
1545 
1546 
1547 char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen)
1548 {
1549  char *a, *b;
1550  long i;
1551  char *newseq;
1552 
1553  *newlen= 0;
1554  if (!seq) return NULL;
1555  newseq = (char*) malloc(seqlen+1);
1556  if (!newseq) return NULL;
1557  for (a= (char*)seq, b=newseq, i=0; *a!=0; a++)
1558  if (*a != gapc) {
1559  *b++= *a;
1560  i++;
1561  }
1562  *b= '\0';
1563  newseq = (char*) realloc(newseq, i+1);
1564  *newlen= i;
1565  return newseq;
1566 }
1567 
1568 
1569 
1570 /***
1571 char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \
1572 {\\fonttbl\
1573  {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\
1574  {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\
1575  {\\f5\\froman Times;}{\\f6\\froman Palatino;}\
1576  {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\
1577 {\\stylesheet\
1578  {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\
1579  {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\
1580  {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}";
1581 
1582 char *rtftail = "}";
1583 ****/
1584 
1585 short writeSeq(FILE *outf, const char *seq, const long seqlen,
1586  const short outform, const char *seqid)
1587 /* dump sequence to standard output */
1588 {
1589  const short kSpaceAll = -9;
1590 #define kMaxseqwidth 250
1591 
1592  boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */
1593  short numline = 0; /* only true if we are writing seq number line (for interleave) */
1594  boolean numright = false, numleft = false;
1595  boolean nameright = false, nameleft = false;
1596  short namewidth = 8, numwidth = 8;
1597  short spacer = 0, width = 50, tab = 0;
1598  /* new parameters: width, spacer, those above... */
1599 
1600  short linesout = 0, seqtype = kNucleic;
1601  long i, j, l, l1, ibase;
1602  char idword[31], endstr[10];
1603  char seqnamestore[128], *seqname = seqnamestore;
1604  char s[kMaxseqwidth], *cp;
1605  char nameform[10], numform[10], nocountsymbols[10];
1606  unsigned long checksum = 0, checktotal = 0;
1607 
1608  gPretty.atseq++;
1609  skipwhitespace(seqid);
1610  l = min(128, strlen(seqid));
1611  strncpy( seqnamestore, seqid, l);
1612  seqname[l] = 0;
1613 
1614  sscanf( seqname, "%30s", idword);
1615  sprintf(numform, "%ld", seqlen);
1616  numwidth= strlen(numform)+1;
1617  nameform[0]= '\0';
1618 
1619  if (strstr(seqname,"checksum") != NULL) {
1620  cp = strstr(seqname,"bases");
1621  if (cp!=NULL) {
1622  for ( ; (cp!=seqname) && (*cp!=','); cp--) ;
1623  if (cp!=seqname) *cp=0;
1624  }
1625  }
1626 
1627  strcpy( endstr,"");
1628  l1 = 0;
1629 
1630  if (outform == kGCG || outform == kMSF)
1631  checksum = GCGchecksum(seq, seqlen, &checktotal);
1632  else
1633  checksum = seqchecksum(seq, seqlen, &checktotal);
1634 
1635  switch (outform) {
1636 
1637  case kPlain:
1638  case kUnknown: /* no header, just sequence */
1639  strcpy(endstr,"\n"); /* end w/ extra blank line */
1640  break;
1641 
1642  case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */
1643  case kGenBank:
1644  fprintf(outf,"LOCUS %s %ld bp\n", idword, seqlen);
1645  fprintf(outf,"DEFINITION %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1646  /* fprintf(outf,"ACCESSION %s\n", accnum); */
1647  fprintf(outf,"ORIGIN \n");
1648  spacer = 11;
1649  numleft = true;
1650  numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */
1651  strcpy(endstr, "\n//");
1652  linesout += 4;
1653  break;
1654 
1655  case kPIR:
1656  /* somewhat like genbank... \\\*/
1657  /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */
1658  fprintf(outf,"ENTRY %s \n", idword);
1659  fprintf(outf,"TITLE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1660  /* fprintf(outf,"ACCESSION %s\n", accnum); */
1661  fprintf(outf,"SEQUENCE \n");
1662  numwidth = 7;
1663  width= 30;
1664  spacer = kSpaceAll;
1665  numleft = true;
1666  strcpy(endstr, "\n///");
1667  /* run a top number line for PIR */
1668  for (j=0; j<numwidth; j++) fputc(' ',outf);
1669  for (j= 5; j<=width; j += 5) fprintf(outf,"%10ld",j);
1670  fputc('\n',outf);
1671  linesout += 5;
1672  break;
1673 
1674  case kNBRF:
1675  if (getseqtype(seq, seqlen) == kAmino)
1676  fprintf(outf,">P1;%s\n", idword);
1677  else
1678  fprintf(outf,">DL;%s\n", idword);
1679  fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1680  spacer = 11;
1681  strcpy(endstr,"*\n");
1682  linesout += 3;
1683  break;
1684 
1685  case kEMBL:
1686  fprintf(outf,"ID %s\n", idword);
1687  /* fprintf(outf,"AC %s\n", accnum); */
1688  fprintf(outf,"DE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1689  fprintf(outf,"SQ %ld BP\n", seqlen);
1690  strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
1691  tab = 4;
1692  spacer = 11;
1693  width = 60;
1694  linesout += 4;
1695  break;
1696 
1697  case kGCG:
1698  fprintf(outf,"%s\n", seqname);
1699  /* fprintf(outf,"ACCESSION %s\n", accnum); */
1700  fprintf(outf," %s Length: %ld (today) Check: %ld ..\n", idword, seqlen, checksum);
1701  spacer = 11;
1702  numleft = true;
1703  strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
1704  linesout += 3;
1705  break;
1706 
1707  case kStrider: /* ?? map ?*/
1708  fprintf(outf,"; ### from DNA Strider ;-)\n");
1709  fprintf(outf,"; DNA sequence %s, %ld bases, %lX checksum.\n;\n", seqname, seqlen, checksum);
1710  strcpy(endstr, "\n//");
1711  linesout += 3;
1712  break;
1713 
1714  case kFitch:
1715  fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1716  spacer = 4;
1717  width = 60;
1718  linesout += 1;
1719  break;
1720 
1721  case kPhylip2:
1722  case kPhylip4:
1723  /* this is version 3.2/3.4 -- simplest way to write
1724  version 3.3 is to write as version 3.2, then
1725  re-read file and interleave the species lines */
1726  if (strlen(idword)>10) idword[10] = 0;
1727  fprintf(outf,"%-10s ",idword);
1728  l1 = -1;
1729  tab = 12;
1730  spacer = 11;
1731  break;
1732 
1733  case kASN1:
1734  seqtype= getseqtype(seq, seqlen);
1735  switch (seqtype) {
1736  case kDNA : cp= (char*)"dna"; break;
1737  case kRNA : cp= (char*)"rna"; break;
1738  case kNucleic : cp= (char*)"na"; break;
1739  case kAmino : cp= (char*)"aa"; break;
1740  case kOtherSeq: cp= (char*)"not-set"; break;
1741  }
1742  fprintf(outf," seq {\n");
1743  fprintf(outf," id { local id %d },\n", gPretty.atseq);
1744  fprintf(outf," descr { title \"%s\" },\n", seqid);
1745  fprintf(outf," inst {\n");
1746  fprintf(outf," repr raw, mol %s, length %ld, topology linear,\n", cp, seqlen);
1747  fprintf(outf," seq-data\n");
1748  if (seqtype == kAmino)
1749  fprintf(outf," iupacaa \"");
1750  else
1751  fprintf(outf," iupacna \"");
1752  l1 = 17;
1753  spacer = 0;
1754  width = 78;
1755  tab = 0;
1756  strcpy(endstr,"\"\n } } ,");
1757  linesout += 7;
1758  break;
1759 
1760  case kPAUP:
1761  nameleft= true;
1762  namewidth = 9;
1763  spacer = 21;
1764  width = 100;
1765  tab = 0; /* 1; */
1766  /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */
1767  /* do a header comment line for paup */
1768  fprintf(outf,"[Name: %-16s Len:%6ld Check: %8lX]\n", idword, seqlen, checksum);
1769  linesout += 1;
1770  break;
1771 
1772  case kPretty:
1773  numline= gPretty.numline;
1774  baseonlynum= gPretty.baseonlynum;
1775  namewidth = gPretty.namewidth;
1776  numright = gPretty.numright;
1777  numleft = gPretty.numleft;
1778  nameright = gPretty.nameright;
1779  nameleft = gPretty.nameleft;
1780  spacer = gPretty.spacer + 1;
1781  width = gPretty.seqwidth;
1782  tab = gPretty.tab;
1783  /* also add rtf formatting w/ font, size, style */
1784  if (gPretty.nametop) {
1785  fprintf(outf,"Name: %-16s Len:%6ld Check: %8lX\n", idword, seqlen, checksum);
1786  linesout++;
1787  }
1788  break;
1789 
1790  case kMSF:
1791  fprintf(outf," Name: %-16s Len:%6ld Check: %5ld Weight: 1.00\n",
1792  idword, seqlen, checksum);
1793  linesout++;
1794  nameleft= true;
1795  namewidth= 15; /* need MAX namewidth here... */
1796  sprintf(nameform, "%%+%ds ",namewidth);
1797  spacer = 11;
1798  width = 50;
1799  tab = 0; /* 1; */
1800  break;
1801 
1802  case kIG:
1803  fprintf(outf,";%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1804  fprintf(outf,"%s\n", idword);
1805  strcpy(endstr,"1"); /* == linear dna */
1806  linesout += 2;
1807  break;
1808 
1809  default :
1810  case kZuker: /* don't attempt Zuker's ftn format */
1811  case kPearson:
1812  fprintf(outf,">%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum);
1813  linesout += 1;
1814  break;
1815  }
1816 
1817  if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth);
1818  if (numline) sprintf(numform, "%%%ds ",numwidth);
1819  else sprintf(numform, "%%%dd ",numwidth);
1820  strcpy( nocountsymbols, kNocountsymbols);
1821  if (baseonlynum) {
1822  if (strchr(nocountsymbols,gPretty.gapchar)==NULL) {
1823  strcat(nocountsymbols," ");
1824  nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar;
1825  }
1826  if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) {
1827  *cp= ' ';
1828  }
1829  }
1830 
1831  if (numline) {
1832  *idword= 0;
1833  }
1834 
1835  width = min(width,kMaxseqwidth);
1836  for (i=0, l=0, ibase = 1; i < seqlen; ) {
1837 
1838  if (l1 < 0) l1 = 0;
1839  else if (l1 == 0) {
1840  if (nameleft) fprintf(outf, nameform, idword);
1841  if (numleft) { if (numline) fprintf(outf, numform, "");
1842  else fprintf(outf, numform, ibase);}
1843  for (j=0; j<tab; j++) fputc(' ',outf);
1844  }
1845 
1846  l1++; /* don't count spaces for width*/
1847  if (numline) {
1848  if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) {
1849  if (numline==1) fputc(' ',outf);
1850  s[l++] = ' ';
1851  }
1852  if (l1 % 10 == 1 || l1 == width) {
1853  if (numline==1) fprintf(outf,"%-9ld ",i+1);
1854  s[l++]= '|'; /* == put a number here */
1855  }
1856  else s[l++]= ' ';
1857  i++;
1858  }
1859 
1860  else {
1861  if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
1862  s[l++] = ' ';
1863  if (!baseonlynum) ibase++;
1864  else if (0==strchr(nocountsymbols,seq[i])) ibase++;
1865  s[l++] = seq[i++];
1866  }
1867 
1868  if (l1 == width || i == seqlen) {
1869  if (outform==kPretty) for ( ; l1<width; l1++) {
1870  if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1))
1871  s[l++] = ' ';
1872  s[l++]=' '; /* pad w/ blanks */
1873  }
1874  s[l] = '\0';
1875  l = 0; l1 = 0;
1876 
1877  if (numline) {
1878  if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */
1879  }
1880  else {
1881  if (i == seqlen) fprintf(outf,"%s%s",s,endstr);
1882  else fprintf(outf,"%s",s);
1883  if (numright || nameright) fputc(' ',outf);
1884  if (numright) fprintf(outf,numform, ibase-1);
1885  if (nameright) fprintf(outf, nameform,idword);
1886  }
1887  fputc('\n',outf);
1888  linesout++;
1889  }
1890  }
1891  return linesout;
1892 } /*writeSeq*/
1893 
1894 
1895 
1896 /* End file: ureadseq.c */
#define Local
Definition: ureadseq.c:72
char * readSeqFp(const short whichEntry_, FILE *fp_, const long skiplines_, const short format_, long *seqlen_, short *nseq_, short *error_, char *seqid_)
Definition: ureadseq.c:1035
#define skipwhitespace(string)
Definition: ureadseq.h:17
#define kZuker
Definition: ureadseq.h:57
long topseqlen
Definition: ureadseq.c:102
long linestart
Definition: ureadseq.c:107
char * seq
Definition: ureadseq.c:104
#define kASN1
Definition: ureadseq.h:66
boolean baseonlynum
Definition: ureadseq.h:84
static GB_ERROR tab(GBL_command_arguments *args, bool pretab)
Definition: adlang1.cxx:914
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
Definition: insdel.cxx:615
int(* isseqcharfirst8)()
Definition: ureadseq.c:110
const unsigned long crctab[]
Definition: ureadseq.c:1439
#define kPhylip
Definition: ureadseq.h:62
#define tolowerstr(s)
Local void readEMBL(struct ReadSeqVars *V)
Definition: ureadseq.c:426
char matchchar
Definition: ureadseq.h:88
char * sp
Definition: ureadseq.c:108
Local void readOlsen(struct ReadSeqVars *V)
Definition: ureadseq.c:541
#define kStartLength
Definition: ureadseq.c:75
boolean numleft
Definition: ureadseq.h:85
#define kOlsen
Definition: ureadseq.h:58
Local void readMSF(struct ReadSeqVars *V)
Definition: ureadseq.c:624
Local boolean endPearson(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:397
Local boolean endFitch(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:472
boolean nameleft
Definition: ureadseq.h:86
Local void addseq(char *s, struct ReadSeqVars *V)
Definition: ureadseq.c:157
Local void readLoop(short margin, boolean addfirst, boolean(*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V), struct ReadSeqVars *V)
Definition: ureadseq.c:216
#define fixmatchchar(s)
Local void readPAUPinterleaved(struct ReadSeqVars *V)
Definition: ureadseq.c:687
int isSeqChar(int c)
Definition: ureadseq.c:117
#define kAmino
Definition: ureadseq.h:46
const char * fname
Definition: ureadseq.c:103
Local void addinfo(char *s, struct ReadSeqVars *V)
Definition: ureadseq.c:196
Local void readline(FILE *f, char *s, long *linestart)
Definition: ureadseq.c:133
char gapchar
Definition: ureadseq.h:88
boolean domatch
Definition: ureadseq.h:87
char * compressSeq(const char gapc, const char *seq, const long seqlen, long *newlen)
Definition: ureadseq.c:1547
#define eMemFull
Definition: ureadseq.h:29
#define kUnknown
Definition: ureadseq.h:73
Local void readPhylipSequential(struct ReadSeqVars *V)
Definition: ureadseq.c:914
boolean filestart
Definition: ureadseq.c:105
#define kPhylip2
Definition: ureadseq.h:59
#define kRNA
Definition: ureadseq.h:44
int Strncasecmp(const char *a, const char *b, long maxn)
Definition: ureadseq.c:50
const char * aminos
Definition: ureadseq.c:77
short err
Definition: ureadseq.c:99
prettyopts gPretty
Definition: ureadseq.c:31
int Strcasecmp(const char *a, const char *b)
Definition: ureadseq.c:36
const char stdsymbols[6]
Definition: ureadseq.c:82
long maxseq
Definition: ureadseq.c:100
#define kASNseqset
Definition: ureadseq.h:77
FILE * seq
Definition: rns.c:46
char * listSeqs(const char *filename_, const long skiplines_, const short format_, short *nseq_, short *error_)
Definition: ureadseq.c:1136
#define eNoData
Definition: ureadseq.h:28
const char nonummask[11]
Definition: ureadseq.c:87
static const char * seqsymbols
Definition: ureadseq.c:84
Local void readStrider(struct ReadSeqVars *V)
Definition: ureadseq.c:283
Local void countseq(char *s, struct ReadSeqVars *V)
Definition: ureadseq.c:183
Local boolean endIG(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:247
Local void readZuker(struct ReadSeqVars *V)
Definition: ureadseq.c:453
#define kMSF
Definition: ureadseq.h:65
short namewidth
Definition: ureadseq.h:90
#define kNBRF
Definition: ureadseq.h:51
static int diff(int v1, int v2, int v3, int v4, int st, int en)
Definition: ClustalV.cxx:534
#define kPlain
Definition: ureadseq.h:63
#define kStrider
Definition: ureadseq.h:54
long seqlencount
Definition: ureadseq.c:100
Local void readNBRF(struct ReadSeqVars *V)
Definition: ureadseq.c:381
FILE * f
Definition: ureadseq.c:106
Local void readIG(struct ReadSeqVars *V)
Definition: ureadseq.c:254
#define eFileNotFound
Definition: ureadseq.h:27
boolean allDone
Definition: ureadseq.c:105
Local void readPIR(struct ReadSeqVars *V)
Definition: ureadseq.c:308
Local void readSeqMain(struct ReadSeqVars *V, const long skiplines_, const short format_)
Definition: ureadseq.c:939
Local boolean endPIR(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:301
#define kNoformat
Definition: ureadseq.h:72
#define kPAUP
Definition: ureadseq.h:67
boolean nameright
Definition: ureadseq.h:86
const char kNocountsymbols[5]
Definition: ureadseq.c:81
short seqwidth
Definition: ureadseq.h:91
Local boolean endPhylipSequential(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:906
static void error(const char *msg)
Definition: mkptypes.cxx:96
fputc('\n', stderr)
#define kGenBank
Definition: ureadseq.h:50
short choice
Definition: ureadseq.c:99
short atseq
Definition: ureadseq.h:89
#define kIG
Definition: ureadseq.h:49
boolean interleaved
Definition: readseq.c:459
char s[256]
Definition: ureadseq.c:108
Local void readFitch(struct ReadSeqVars *V)
Definition: ureadseq.c:482
#define kASNseqentry
Definition: ureadseq.h:76
long outform
Definition: readseq.c:462
#define kNucleic
Definition: ureadseq.h:45
#define kMaxseqwidth
#define kGCG
Definition: ureadseq.h:53
Local boolean endStrider(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:276
char * seqid
Definition: ureadseq.c:104
boolean nametop
Definition: ureadseq.h:86
Local boolean endZuker(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:446
const char nummask[11]
Definition: ureadseq.c:86
boolean addit
Definition: ureadseq.c:105
#define kFitch
Definition: ureadseq.h:55
Local void readPAUPsequential(struct ReadSeqVars *V)
Definition: ureadseq.c:776
#define kOtherSeq
Definition: ureadseq.h:42
short writeSeq(FILE *outf, const char *seq, const long seqlen, const short outform, const char *seqid)
Definition: ureadseq.c:1585
#define kEMBL
Definition: ureadseq.h:52
unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal)
Definition: ureadseq.c:1494
int(* isseqchar)()
Definition: ureadseq.c:111
boolean done
Definition: ureadseq.c:105
int isAnyChar(int c)
Definition: ureadseq.c:128
#define to_upper(c)
Definition: ureadseq.h:23
#define kListSequences
Definition: ureadseq.h:39
#define eUnknownFormat
Definition: ureadseq.h:34
Local void readGenBank(struct ReadSeqVars *V)
Definition: ureadseq.c:336
#define kPhylip4
Definition: ureadseq.h:60
Local void GetLine(struct ReadSeqVars *V)
Definition: ureadseq.c:146
#define kDNA
Definition: ureadseq.h:43
int isSeqNumChar(int c)
Definition: ureadseq.c:122
short topnseq
Definition: ureadseq.c:101
char(& no)[2]
Definition: downcast.h:31
Local void readPlain(struct ReadSeqVars *V)
Definition: ureadseq.c:496
short seqFileFormatFp(FILE *fseq, long *skiplines, short *error)
Definition: ureadseq.c:1167
#define seqchecksum
Definition: ureadseq.h:139
short seqFileFormat(const char *filename, long *skiplines, short *error)
Definition: ureadseq.c:1153
short spacer
Definition: ureadseq.h:91
unsigned long GCGchecksum(const char *seq, const long seqlen, unsigned long *checktotal)
Definition: ureadseq.c:1422
const char * protonly
Definition: ureadseq.c:79
Local void unGetLine(struct ReadSeqVars *V)
Definition: ureadseq.c:151
Local boolean endGB(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:329
Local void readPearson(struct ReadSeqVars *V)
Definition: ureadseq.c:404
#define kPearson
Definition: ureadseq.h:56
short nseq
Definition: ureadseq.c:99
const char allsymbols[32]
Definition: ureadseq.c:83
#define kMaxFormat
Definition: ureadseq.h:70
#define ReadOneLine(sp)
Local boolean endNBRF(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:361
Local void readPhylipInterleaved(struct ReadSeqVars *V)
Definition: ureadseq.c:853
long seqlen
Definition: ureadseq.c:100
char * readSeq(const short whichEntry_, const char *filename_, const long skiplines_, const short format_, long *seqlen_, short *nseq_, short *error_, char *seqid_)
Definition: ureadseq.c:1084
#define min(a, b)
Definition: f2c.h:153
short getseqtype(const char *seq, const long seqlen)
Definition: ureadseq.c:1513
#define kPretty
Definition: ureadseq.h:68
char matchchar
Definition: ureadseq.c:104
#define kPIR
Definition: ureadseq.h:64
Local boolean endEMBL(boolean *addend, boolean *ungetend, struct ReadSeqVars *V)
Definition: ureadseq.c:419
short numline
Definition: ureadseq.h:89
const char * primenuc
Definition: ureadseq.c:78
Local void readUWGCG(struct ReadSeqVars *V)
Definition: ureadseq.c:514
short tab
Definition: ureadseq.h:91
boolean numright
Definition: ureadseq.h:85
GB_write_int const char s
Definition: AW_awar.cxx:154