22 =
"readSeq (1Feb93), multi-format molbio sequence reader.\n";
192 " 9. Zuker (in-only)",
193 "10. Olsen (in-only)",
201 "18. Pretty (out-only)",
204 #define kFormCount 30
205 #define kMaxFormName 15
247 const char compl[] =
" !\"#$%&'()*+,-./0123456789:;<=>?@TVGHNNCDNNMNKNNYRYSAABWNRN[\\]^_`tvghnncdnnmnknnyrysaabwnrn{|}~";
259 default:
return "(unknown)";
262 else return formats[format-1];
272 int maxlen, i, match, matchat;
277 namelen = strlen(name2);
288 for (i=0; i<maxlen; i++) lname[i] =
to_lower(name2[i]);
293 match = strncmp( lname,
formname[i].name, maxlen);
296 else if (matchat ==
kNoformat) matchat = i;
313 listlen = strlen(list);
314 printf(
"Sequences in %s (format is %s)\n", inputfile,
formatstr(format));
315 for (i=0, l=0; i < listlen; i++) {
316 if (list[i] == (
char)
NEWLINE) {
334 "usage: readseq [-options] in.seq > out.seq\n");
335 fprintf(stderr,
" options\n");
337 fprintf(stderr,
" -a[ll] select All sequences\n");
338 fprintf(stderr,
" -c[aselower] change to lower case\n");
339 fprintf(stderr,
" -C[ASEUPPER] change to UPPER CASE\n");
340 fprintf(stderr,
" -degap[=-] remove gap symbols\n");
341 fprintf(stderr,
" -i[tem=2,3,4] select Item number(s) from several\n");
342 fprintf(stderr,
" -l[ist] List sequences only\n");
343 fprintf(stderr,
" -o[utput=]out.seq redirect Output\n");
344 fprintf(stderr,
" -p[ipe] Pipe (command line, <stdin, >stdout)\n");
345 fprintf(stderr,
" -r[everse] change to Reverse-complement\n");
346 fprintf(stderr,
" -v[erbose] Verbose progress\n");
347 fprintf(stderr,
" -f[ormat=]# Format number for output, or\n");
348 fprintf(stderr,
" -f[ormat=]Name Format name for output:\n");
351 fprintf( stderr,
" %-20s %-20s\n",
352 formats[i], formats[midi+i]);
355 fprintf(stderr,
" \n");
356 fprintf(stderr,
" Pretty format options: \n");
357 fprintf(stderr,
" -wid[th]=# sequence line width\n");
358 fprintf(stderr,
" -tab=# left indent\n");
359 fprintf(stderr,
" -col[space]=# column space within sequence line on output\n");
360 fprintf(stderr,
" -gap[count] count gap chars in sequence numbers\n");
361 fprintf(stderr,
" -nameleft, -nameright[=#] name on left/right side [=max width]\n");
362 fprintf(stderr,
" -nametop name at top/bottom\n");
363 fprintf(stderr,
" -numleft, -numright seq index on left/right side\n");
364 fprintf(stderr,
" -numtop, -numbot index on top/bottom\n");
365 fprintf(stderr,
" -match[=.] use match base for 2..n species\n");
366 fprintf(stderr,
" -inter[line=#] blank line(s) between sequence blocks\n");
384 case eFileNotFound: fprintf(stderr,
"arb_readseq: File not found: %s\n", inputfile);
386 case eFileCreate: fprintf(stderr,
"arb_readseq: Can't open output file.\n");
388 case eASNerr: fprintf(stderr,
"arb_readseq: Error in ASN.1 sequence routines.\n");
390 case eNoData: fprintf(stderr,
"arb_readseq: No data in file.\n");
392 case eItemNotFound: fprintf(stderr,
"arb_readseq: Specified item not in file.\n");
394 case eUnequalSize: fprintf(stderr,
"arb_readseq: This format requires equal length sequences.\nSequence truncated or padded to fit.\n");
396 case eUnknownFormat: fprintf(stderr,
"arb_readseq: Error: this format is unknown to me.\n");
398 case eOneFormat: fprintf(stderr,
"arb_readseq: Warning: This format permits only 1 sequence per file.\n");
400 case eMemFull: fprintf(stderr,
"arb_readseq: Out of storage memory. Sequence truncated.\n");
402 case ePipeStdin: fprintf(stderr,
"arb_readseq: piping from stdin is prohibited.\n");
404 default: fprintf(stderr,
"arb_readseq: errorcode = %d\n", err);
409 #define USERINPUT_BUFFERSIZE 128
410 #define USERINPUT_BUFFERSIZE_SEQ 256
422 fprintf( stderr,
" %-20s %-20s\n",
423 formats[i], formats[midi+i]);
424 fprintf(stderr,
"\nChoose an output format (name or #): \n");
436 boolean checkopt(
boolean casesense,
char *sopt,
const char *smatch,
short minword)
438 long lenopt, lenmatch;
442 lenopt = strlen(sopt);
443 lenmatch= strlen(smatch);
444 minmaxw=
max(minword,
min(lenopt, lenmatch));
447 result= (!strncmp( sopt, smatch, minmaxw));
457 #define kMaxwhichlist 50
492 char sparamstore[256], *sparam= sparamstore;
502 else if (*sopt ==
'-') {
505 char *cp= strchr(sopt,
'=');
508 strcpy(sparam, cp+1);
513 if (
checkopt(
false, sopt,
"-help", 2)) {
518 if (
checkopt(
false, sopt,
"-all", 2)) {
523 if (
checkopt(
false, sopt,
"-colspace", 4)) {
529 if (
checkopt(
true, sopt,
"-caselower", 2)) {
533 if (
checkopt(
true, sopt,
"-CASEUPPER", 2)) {
538 if (
checkopt(
false, sopt,
"-pipe", 2)) {
543 if (
checkopt(
false, sopt,
"-list", 2)) {
548 if (
checkopt(
false, sopt,
"-reverse", 2)) {
553 if (
checkopt(
false, sopt,
"-verbose", 2)) {
558 if (
checkopt(
false, sopt,
"-match", 5)) {
563 if (
checkopt(
false, sopt,
"-degap", 4)) {
569 if (
checkopt(
false, sopt,
"-interline", 4)) {
574 if (
checkopt(
false, sopt,
"-item", 2)) {
578 if (*cp == 0) cp= sopt+2;
592 if (
checkopt(
false, sopt,
"-format", 5)) {
593 if (*sparam==0) {
for (sparam= sopt+2; isalpha(*sparam); sparam++) ; }
597 if (
checkopt(
false, sopt,
"-f", 2)) {
598 if (*sparam==0) sparam= sopt+2;
603 if (
checkopt(
false, sopt,
"-output", 3)) {
604 if (*sparam==0) {
for (sparam= sopt+3; isalpha(*sparam); sparam++) ; }
605 strcpy( oname, sparam);
606 foo = fopen( oname,
"w");
612 if (
checkopt(
false, sopt,
"-o", 2)) {
613 if (*sparam==0) sparam= sopt+2;
614 strcpy( oname, sparam);
615 foo = fopen( oname,
"w");
622 if (
checkopt(
false, sopt,
"-width", 2)) {
623 if (*sparam==0) {
for (sparam= sopt+2; !
rs_isdigit(*sparam) && *sparam!=0; sparam++) ; }
629 if (
checkopt(
false, sopt,
"-tab", 4)) {
630 if (*sparam==0) {
for (sparam= sopt+2; !
rs_isdigit(*sparam) && *sparam!=0; sparam++) ; }
636 if (
checkopt(
false, sopt,
"-gapcount", 4)) {
641 if (
checkopt(
false, sopt,
"-nointerleave", 8)) {
646 if (
checkopt(
false, sopt,
"-nameleft", 7)) {
647 if (*sparam==0) {
for (sparam= sopt+2; !
rs_isdigit(*sparam) && *sparam!=0; sparam++) ; }
653 if (
checkopt(
false, sopt,
"-nameright", 7)) {
654 if (*sparam==0) {
for (sparam= sopt+2; !
rs_isdigit(*sparam) && *sparam!=0; sparam++) ; }
660 if (
checkopt(
false, sopt,
"-nametop", 6)) {
665 if (
checkopt(
false, sopt,
"-numleft", 6)) {
666 if (*sparam==0) {
for (sparam= sopt+2; !
rs_isdigit(*sparam) && *sparam!=0; sparam++) ; }
672 if (
checkopt(
false, sopt,
"-numright", 6)) {
673 if (*sparam==0) {
for (sparam= sopt+2; !
rs_isdigit(*sparam) && *sparam!=0; sparam++) ; }
680 if (
checkopt(
false, sopt,
"-numtop", 6)) {
684 if (
checkopt(
false, sopt,
"-numbottom", 6)) {
696 strcpy( inputfile, sopt);
714 #define Exit(a) return(a)
715 siow_main(
int argc,
char *argv[])
718 #define Exit(a) exit(a)
720 int main(
int argc,
char *argv[])
723 boolean closein =
false;
724 short ifile, nseq, atseq,
format, err = 0, seqtype =
kDNA,
725 nlines, seqout = 0, phylvers = 2;
726 long i, skiplines, seqlen, seqlen0;
727 unsigned long checksum= 0, checkall= 0;
728 char *
seq, *firstseq = NULL, *seqlist, tempname[256];
730 char seqid[256], *seqidptr = seqid;
732 FILE *ftmp, *fin = NULL, *fout = NULL;
733 long outindexmax= 0, noutindex= 0, *outindex = NULL;
735 #define exit_main(err) { \
736 if (closeout) fclose(fout); \
737 if (closein) fclose(fin); \
738 if (*tempname!=0) remove(tempname);\
741 #define indexout() if (interleaved) {\
742 if (noutindex>=outindexmax) {\
743 outindexmax= noutindex + 20;\
744 outindex= (long*) realloc(outindex, sizeof(long)*outindexmax);\
745 if (outindex==NULL) { err= eMemFull; erralert(err); exit_main(err); }\
747 outindex[noutindex++]= ftell(fout);\
757 for (i=1; i < argc; i++) {
771 inputfile = tempname;
772 ftmp = fopen( inputfile,
"w");
774 while ((c = getc(stdin)) != EOF)
fputc(c, ftmp);
788 fprintf(stderr,
"\nName of output file (?=help, defaults to display): \n");
792 else if (*oname != 0) {
794 foo = fopen( oname,
"w");
830 fout = ftmp = tmpfile();
831 outindexmax= 30; noutindex= 0;
832 outindex = (
long*) malloc(outindexmax*
sizeof(
long));
841 if (*argv[ifile] !=
'-') {
842 strcpy( inputfile, argv[ifile]);
850 fprintf(stderr,
"\nName an input sequence or -option: \n");
854 if (*stemp==0)
goto fini;
855 stemp= strtok(stempstore,
" \n\r\t");
859 stemp= strtok( NULL,
" \n\r\t");
864 if (*inputfile == 0)
break;
871 seqlist = listASNSeqs( inputfile, skiplines, format, &nseq, &err);
874 seqlist =
listSeqs( inputfile, skiplines, format, &nseq, &err);
898 fprintf(stderr,
"\nChoose a sequence (# or All): \n");
913 fin = fopen(inputfile,
"r");
924 sprintf( stemp,
"%s_%ld", oname,
whichSeq);
925 freopen( stemp,
"w", fout);
926 fprintf( stderr,
"Writing sequence %ld to file %s\n",
whichSeq, stemp);
937 &seqlen, &atseq, &err, seqidptr);
946 seq = readASNSeq(
whichSeq, inputfile, skiplines, format,
947 &seqlen, &atseq, &err, &seqidptr);
952 &seqlen, &atseq, &err, seqidptr);
961 free(seq); seq= newseq; seqlen= newlen;
968 fprintf( stderr,
"Sequence %ld, length= %ld, checksum= %lX, format= %s, id= %s\n",
976 if (seqout == 0) fprintf(
foo,
"\\\\\\\n");
979 if (seqout == 0)
fputs(kASN1headline,
foo);
985 if (chooseall) i= nseq;
else i=1;
986 if (phylvers >= 4) fprintf(
foo,
" %ld %ld\n", i, seqlen);
987 else fprintf(
foo,
" %ld %ld YF\n", i, seqlen);
991 else if (seqlen != seqlen0) {
993 if (seqlen < seqlen0) seq = (
char *)realloc(seq, seqlen0);
1005 else if (seqlen != seqlen0) {
1007 if (seqlen < seqlen0) seq = (
char *)realloc(seq, seqlen0);
1017 for (i = 0; i<seqlen; i++) seq[i] =
to_upper(seq[i]);
1019 for (i = 0; i<seqlen; i++) seq[i] =
to_lower(seq[i]);
1022 for (i = 0; i<seqlen; i++)
if (seq[i] ==
'.') seq[i] =
'?';
1028 for (j=0, k=seqlen-1; j <= k; j++, k--) {
1029 ctemp = compl[seq[j] -
' '];
1030 seq[j] = compl[seq[k] -
' '];
1036 for (i=0; i<seqlen; i++){
1062 else if (seq!=NULL) { free(seq); seq = NULL; }
1066 && seqidptr && seqidptr!= seqid)
1073 if (closein) { fclose(fin); closein=
false; }
1080 if (firstseq) { free(firstseq); firstseq= NULL; }
1095 fprintf(
foo,
"\n %s MSF: %ld Type: N January 01, 1776 12:00 Check: %lu ..\n\n",
1096 cp, seqlen, checkall);
1100 fprintf(
foo,
"#NEXUS\n");
1102 fprintf(
foo,
"[%s -- data title]\n\n", cp);
1107 if (phylvers >= 4) fprintf(
foo,
" %d %ld\n", seqout, seqlen);
1108 else fprintf(
foo,
" %d %ld YF\n", seqout, seqlen);
1114 short iline, j, leaf, iseq;
1115 char *
s = stempstore;
1119 for (leaf=0; leaf<nlines; leaf++) {
1125 case kDNA : cp=
"dna";
break;
1126 case kRNA : cp=
"rna";
break;
1128 case kAmino : cp=
"protein";
break;
1131 fprintf(
foo,
"\nbegin data;\n");
1132 fprintf(
foo,
" dimensions ntax=%d nchar=%ld;\n", seqout, seqlen);
1134 fprintf(
foo,
" format datatype=%s interleave=yes missing=. gap=%c", cp,
gPretty.
gapchar);
1136 fprintf(
foo,
";\n matrix\n");
1139 for (iseq=0; iseq<noutindex; iseq++) {
1140 fseek(ftmp, outindex[iseq], 0);
1141 for (iline=0; iline<=leaf; iline++)
1142 if (!fgets(s, 256, ftmp)) *s= 0;
1143 if (ftell(ftmp) <= outindex[iseq+1])
1157 if (outindex != NULL) free(outindex);
char * readSeqFp(const short whichEntry_, FILE *fp_, const long skiplines_, const short format_, long *seqlen_, short *nseq_, short *error_, char *seqid_)
#define skipwhitespace(string)
AliDataPtr format(AliDataPtr data, const size_t wanted_len, GB_ERROR &error)
int main(int argc, char *argv[])
char * compressSeq(const char gapc, const char *seq, const long seqlen, long *newlen)
#define USERINPUT_BUFFERSIZE_SEQ
int Strncasecmp(const char *a, const char *b, long maxn)
char * listSeqs(const char *filename_, const long skiplines_, const short format_, short *nseq_, short *error_)
short whichlist[kMaxwhichlist+1]
const char * kASN1headline
#define USERINPUT_BUFFERSIZE
#define kPhylipInterleave
char onamestore[USERINPUT_BUFFERSIZE]
const char * formatstr(short format)
#define kPhylipSequential
const char * formats[kMaxFormat+1]
boolean checkopt(boolean casesense, char *sopt, const char *smatch, short minword)
fputs(TRACE_PREFIX, stderr)
short writeSeq(FILE *outf, const char *seq, const long seqlen, const short outform, const char *seqid)
const struct formatTable formname[]
int chooseFormat(boolean quietly)
int parseformat(char *name2)
short seqFileFormat(const char *filename, long *skiplines, short *error)
unsigned long GCGchecksum(const char *seq, const long seqlen, unsigned long *checktotal)
static char inputfilestore[256]
static char * gets_noOverflow(char *buffer, int buffersize)
static void dumpSeqList(char *list, short format)
char * readSeq(const short whichEntry_, const char *filename_, const long skiplines_, const short format_, long *seqlen_, short *nseq_, short *error_, char *seqid_)
short getseqtype(const char *seq, const long seqlen)
GB_write_int const char s