ARB
gcg.cxx
Go to the documentation of this file.
1 #include "genbank.h"
2 #include "embl.h"
3 #include "macke.h"
4 
5 static void gcg_doc_out(const char *line, Writer& writer) {
6  // Output non-sequence data(document) of gcg format.
7  int indi, len;
8  int previous_is_dot;
9 
10  ca_assert(writer.ok());
11 
12  for (indi = 0, len = str0len(line), previous_is_dot = 0; indi < len; indi++) {
13  if (previous_is_dot) {
14  if (line[indi] == '.')
15  writer.out(' ');
16  else
17  previous_is_dot = 0;
18  }
19  writer.out(line[indi]);
20  if (line[indi] == '.')
21  previous_is_dot = 1;
22  }
23 }
24 
25 static int gcg_checksum(const char *Str, int numofstr) {
26  // Calculate gcg_checksum for GCG format.
27  int cksum = 0;
28  int count = 0;
29  for (int indi = 0; indi < numofstr; indi++) {
30  if (!is_gapchar(Str[indi])) {
31  count++;
32  cksum = ((cksum + count * toupper(Str[indi])) % 10000);
33  if (count == 57) count = 0;
34  }
35  }
36  return cksum;
37 }
38 
39 static void gcg_out_origin(const Seq& seq, Writer& write) {
40  // Output sequence data in gcg format.
41  int indi, indj, indk;
42  const char *sequence = seq.get_seq();
43 
44  for (indi = 0, indj = 0, indk = 1; indi < seq.get_len(); indi++) {
45  if (!is_gapchar(sequence[indi])) {
46  if ((indk % 50) == 1) write.outf("%8d ", indk);
47  write.out(sequence[indi]);
48  indj++;
49  if (indj == 10) {
50  write.out(' ');
51  indj = 0;
52  }
53  if ((indk % 50) == 0) write.out("\n\n");
54  indk++;
55  }
56  }
57  if ((indk % 50) != 1) write.out(" \n");
58 }
59 
60 static void gcg_seq_out(const Seq& seq, Writer& write, const char *key) {
61  // Output sequence data in gcg format.
62  write.outf("\n%s Length: %d %s Type: N Check: %d ..\n\n",
63  key,
64  seq.get_len()-seq.count_gaps(),
66  gcg_checksum(seq.get_seq(), seq.get_len()));
67  gcg_out_origin(seq, write);
68 }
69 
70 class GcgWriter;
71 
72 class GcgCommentWriter : public Writer {
73  GcgWriter& gcg_writer;
74 
75  char linebuf[LINESIZE];
76  int used;
77 public:
79  : gcg_writer(gcg_writer_),
80  used(0)
81  {}
83  ca_assert(used == 0); // trailing \n has not been written
84  }
85 
86  bool ok() const OVERRIDE { return true; }
87  void throw_write_error() const OVERRIDE { ca_assert(0); }
88  void out(char ch) OVERRIDE;
89  const char *name() const OVERRIDE { return "comment-writer"; }
90 };
91 
92 class GcgWriter : public FileWriter { // derived from a Noncopyable
93  char *species_name;
94  bool seq_written; // if true, any further sequences are ignored
95 
96  GcgCommentWriter writer;
97 
98 public:
99  GcgWriter(const char *outname)
100  : FileWriter(outname),
101  species_name(NULp),
102  seq_written(false),
103  writer(*this)
104  {}
105  ~GcgWriter() OVERRIDE { free(species_name); }
106 
107  void set_species_name(const char *next_name) {
108  if (!seq_written) species_name = nulldup(next_name);
109  else warningf(111, "Species '%s' dropped (GCG allows only 1 sequence per file)", next_name);
110  }
111 
112  void add_comment(const char *comment) {
113  if (!seq_written) gcg_doc_out(comment, *this);
114  }
115 
117  ca_assert(!seq_written);
118  return writer;
119  }
120 
121  void write_seq_data(const Seq& seq) {
122  if (!seq_written) {
123  ca_assert(species_name); // you have to call set_species_name() before!
124  gcg_seq_out(seq, *this, species_name);
125  seq_written = true;
126  }
127  }
128 
129  void expect_written() {
130  FileWriter::seq_done(seq_written);
131  seq_written = false;
133  }
134 };
135 
136 void GcgCommentWriter::out(char ch) {
137  linebuf[used++] = ch;
138  ca_assert(used<LINESIZE);
139  if (ch == '\n') {
140  linebuf[used] = 0;
141  gcg_writer.add_comment(linebuf);
142  used = 0;
143  }
144 }
145 
146 static void macke_to_gcg(const char *inf, const char *outf) {
147  MackeReader reader(inf);
148  GcgWriter out(outf);
149 
150  Seq seq;
151  if (reader.read_one_entry(seq)) {
152  Macke& macke = dynamic_cast<Macke&>(reader.get_data());
153  out.set_species_name(macke.get_id());
154  macke_seq_info_out(macke, out);
155  out.write_seq_data(seq);
156 
157  reader.ignore_rest_of_file();
158  }
159  out.expect_written();
160 }
161 
162 static void genbank_to_gcg(const char *inf, const char *outf) {
164  GcgWriter write(outf);
165 
166  GenBank gbk;
167  Seq seq;
168 
169  GenbankReader& greader = dynamic_cast<GenbankReader&>(*reader);
170  if (GenbankParser(gbk, seq, greader).parse_entry()) {
171  genbank_out_header(gbk, seq, write.comment_writer());
173  write.out("ORIGIN\n");
174  write.set_species_name(gbk.get_id());
175  write.write_seq_data(seq);
176 
177  reader->ignore_rest_of_file();
178  }
179  write.expect_written();
180 }
181 
182 static void embl_to_gcg(const char *inf, const char *outf) {
183  EmblSwissprotReader reader(inf);
184  GcgWriter write(outf);
185 
186  Embl embl;
187  Seq seq;
188 
189  if (EmblParser(embl, seq, reader).parse_entry()) {
190  embl_out_header(embl, seq, write);
191  write.set_species_name(embl.get_id());
192  write.write_seq_data(seq);
193 
194  reader.ignore_rest_of_file();
195  }
196  write.expect_written();
197 }
198 
199 void to_gcg(const FormattedFile& in, const char *outf) {
200  // Convert from whatever to GCG format
201  // @@@ use InputFormat ?
202 
203  switch (in.type()) {
204  case MACKE: macke_to_gcg(in.name(), outf); break;
205  case GENBANK: genbank_to_gcg(in.name(), outf); break;
206  case EMBL:
207  case SWISSPROT: embl_to_gcg(in.name(), outf); break;
208  default:
210  break;
211  }
212 }
213 
214 
CONSTEXPR_INLINE int str0len(const char *str)
Definition: global.h:98
static int gcg_checksum(const char *Str, int numofstr)
Definition: gcg.cxx:25
static void gcg_out_origin(const Seq &seq, Writer &write)
Definition: gcg.cxx:39
GcgWriter(const char *outname)
Definition: gcg.cxx:99
void seq_done()
Definition: reader.h:148
virtual bool ok() const =0
void out(char ch) FINAL_OVERRIDE
Definition: reader.h:139
static void embl_to_gcg(const char *inf, const char *outf)
Definition: gcg.cxx:182
GcgCommentWriter(GcgWriter &gcg_writer_)
Definition: gcg.cxx:78
Format type() const
Definition: fun.h:62
void warningf(int warning_num, const char *warning_messagef,...) __ATTR__FORMAT(2)
Definition: util.cxx:66
void expect_written()
Definition: gcg.cxx:129
void write_seq_data(const Seq &seq)
Definition: gcg.cxx:121
#define ca_assert(cond)
Definition: global.h:33
~GcgCommentWriter() OVERRIDE
Definition: gcg.cxx:82
void expect_written()
Definition: reader.cxx:97
void to_gcg(const FormattedFile &in, const char *outf)
Definition: gcg.cxx:199
Definition: fun.h:19
Definition: reader.h:95
static void macke_to_gcg(const char *inf, const char *outf)
Definition: gcg.cxx:146
void throw_write_error() const OVERRIDE
Definition: gcg.cxx:87
static void gcg_seq_out(const Seq &seq, Writer &write, const char *key)
Definition: gcg.cxx:60
FILE * seq
Definition: rns.c:46
void ignore_rest_of_file() OVERRIDE
Definition: reader.h:85
const char * name() const
Definition: fun.h:61
static void genbank_to_gcg(const char *inf, const char *outf)
Definition: gcg.cxx:162
static void gcg_doc_out(const char *line, Writer &writer)
Definition: gcg.cxx:5
static SmartPtr< FormatReader > create(const FormattedFile &in)
Definition: convert.cxx:13
InputFormat & get_data() OVERRIDE
Definition: macke.h:186
~GcgWriter() OVERRIDE
Definition: gcg.cxx:105
Generic smart pointer.
Definition: smartptr.h:149
void ignore_rest_of_file() OVERRIDE
Definition: macke.h:185
bool ok() const OVERRIDE
Definition: gcg.cxx:86
#define false
Definition: ureadseq.h:13
Definition: fun.h:12
Definition: seq.h:43
virtual void out(char ch)=0
Writer & comment_writer()
Definition: gcg.cxx:116
void set_species_name(const char *next_name)
Definition: gcg.cxx:107
virtual int outf(const char *format,...) __ATTR__FORMAT_MEMBER(1)
Definition: reader.cxx:121
Definition: fun.h:14
const char * gcg_date(const char *input)
Definition: date.cxx:232
void genbank_out_base_count(const Seq &seq, Writer &write)
Definition: genbank.cxx:440
Definition: fun.h:13
void throw_conversion_not_supported(Format inType, Format ouType)
Definition: fconv.cxx:27
void add_comment(const char *comment)
Definition: gcg.cxx:112
int get_len() const
Definition: seq.h:107
void embl_out_header(const Embl &embl, const Seq &seq, Writer &write)
Definition: embl.cxx:315
Definition: fun.h:15
const char * today_date()
Definition: date.cxx:214
#define OVERRIDE
Definition: cxxforward.h:93
bool is_gapchar(char ch)
Definition: global.h:119
void genbank_out_header(const GenBank &gbk, const Seq &seq, Writer &write)
Definition: genbank.cxx:368
int count_gaps() const
Definition: seq.h:120
static int line
Definition: arb_a2ps.c:296
#define NULp
Definition: cxxforward.h:97
const char * name() const OVERRIDE
Definition: gcg.cxx:89
const char * get_seq() const
Definition: seq.h:110
void out(char ch) OVERRIDE
Definition: gcg.cxx:136
bool read_one_entry(Seq &seq) OVERRIDE __ATTR__USERESULT
Definition: macke.cxx:356
#define LINESIZE
Definition: defs.h:16
void macke_seq_info_out(const Macke &macke, Writer &write)
Definition: macke.cxx:162