ARB
embl.cxx
Go to the documentation of this file.
1 #include "embl.h"
2 #include "genbank.h"
3 #include "macke.h"
4 #include "wrap.h"
5 
// Walk the lines following the current one and handle those that carry the
// same keyword as 'pattern'.
//   pattern : keyword the follow-up lines are compared against
//   Str     : entry string passed in by the caller (not touched by any statement visible here)
//   reader  : advanced line by line; left on the first line with content whose keyword differs,
//             or at the end of input
// NOTE(review): the embedded listing numbers jump from 20 to 22 below, so one
// statement is missing from this listing - presumably the one that appends
// 'temp' to 'Str'. Confirm against the original file before changing this code.
6 static void embl_continue_line(const char *pattern, char*& Str, Reader& reader) {
7  // if there are (numb) blanks at the beginning of line,
8  // it is a continue line of the current command.
9  int ind;
10  char key[TOKENSIZE], temp[LINESIZE];
11 
12  // check continue lines
 // lines for which has_content() is false are stepped over without ending the loop
13  for (++reader; reader.line(); ++reader) {
14  if (has_content(reader.line())) {
15  embl_key_word(reader.line(), 0, key);
16  if (!str_equal(pattern, key)) break;
17 
18  // remove end-of-line, if there is any
19  ind = Skip_white_space(reader.line(), p_nonkey_start);
20  strcpy(temp, reader.line() + ind);
 // NOTE(review): listing line 21 is missing here (see header comment)
22  }
23  }
24 }
25 
26 static void embl_one_entry(Reader& reader, char*& entry, const char *key) {
27  // Read in one embl entry lines.
28  int index = Skip_white_space(reader.line(), p_nonkey_start);
29  freedup(entry, reader.line() + index);
30  embl_continue_line(key, entry, reader);
31 }
32 
33 static void embl_date(Embl& embl, Reader& reader) {
34  // Read in embl DATE lines.
35  int index = Skip_white_space(reader.line(), p_nonkey_start);
36  freedup(embl.dateu, reader.line() + index);
37 
38  ++reader;
39 
40  char key[TOKENSIZE];
41  embl_key_word(reader.line(), 0, key);
42  if (str_equal(key, "DT")) {
43  index = Skip_white_space(reader.line(), p_nonkey_start);
44  freedup(embl.datec, reader.line() + index);
45  // skip the rest of DT lines
46  do {
47  ++reader;
48  if (!reader.line()) break;
49  embl_key_word(reader.line(), 0, key);
50  }
51  while (str_equal(key, "DT"));
52  }
53  else {
54  // always expect more than two DT lines
55  warning(33, "one DT line is missing");
56  }
57 }
58 
59 static void embl_correct_title(Emblref& ref) {
60  // Check missing '"' at the both ends
61 
62  terminate_with(ref.title, ';');
63 
64  int len = str0len(ref.title);
65  if (len > 2 && (ref.title[0] != '"' || ref.title[len - 3] != '"')) {
66  char *temp = NULp;
67  if (ref.title[0] != '"')
68  temp = ARB_strdup("\"");
69  else
70  temp = ARB_strdup("");
71  Append(temp, ref.title);
72  if ((len > 2 && ref.title[len - 3]
73  != '"')) {
74  len = str0len(temp);
75  temp[len - 2] = '"';
76  terminate_with(temp, ';');
77  }
78  freedup(ref.title, temp);
79  free(temp);
80  }
81 }
82 
83 int comment_subkey(const char *line, char *key) {
84  // Get the subkey-word (including delimiting ':') from a comment line
85  int len = parse_key_word(line, key, ":\t\n(");
86  if (!len) return 0;
87 
88  if (line[len] == ':') {
89  key[len] = ':';
90  key[len+1] = 0;
91  }
92  return len+1;
93 }
94 
inline bool is_embl_comment(const char *line) {
    // true if 'line' is non-null and starts with the two characters "CC"
    if (!line) return false;
    return line[0] == 'C' && line[1] == 'C';
}
96 
97 static void embl_one_comment_entry(char*& datastring, int start_index, Reader& reader) {
98  // Read in one embl sub-entry in comments lines.
99  // If it's not a RDP defined comment, you should not call this function.
100 
101  int index = Skip_white_space(reader.line(), start_index);
102  freedup(datastring, reader.line() + index);
103 
104  const int expectedIndent = RDP_CONTINUED_INDENT+RDP_SUBKEY_INDENT;
105 
106  for (++reader;
107  is_embl_comment(reader.line()) && count_spaces(reader.line() + 2) >= expectedIndent;
108  ++reader)
109  {
110  index = Skip_white_space(reader.line(), p_nonkey_start + expectedIndent);
111 
112  char temp[LINESIZE];
113  strcpy(temp, reader.line() + index);
114  skip_eolnl_and_append_spaced(datastring, temp);
115  }
116 }
117 
118 static void embl_comments(Embl& embl, Reader& reader) {
119  // Read in embl comment lines.
120 
121  for (; is_embl_comment(reader.line());) {
122  char key[TOKENSIZE];
123  int index = Skip_white_space(reader.line(), 5);
124  int offset = comment_subkey(reader.line() + index, key);
125  index = Skip_white_space(reader.line(), index + offset);
126 
127  RDP_comment_parser one_comment_entry = embl_one_comment_entry;
128  RDP_comments& comments = embl.comments;
129 
130  if (!parse_RDP_comment(comments, one_comment_entry, key, index, reader)) {
131  // other comments
132  Append(comments.others, reader.line() + 5);
133  ++reader;
134  }
135  }
136 }
137 
138 static void embl_skip_unidentified(const char *pattern, Reader& reader) {
139  // if there are (numb) blanks at the beginning of line,
140  // it is a continue line of the current command.
141 
142  for (++reader; reader.line(); ++reader) {
143  char key[TOKENSIZE];
144  embl_key_word(reader.line(), 0, key);
145  if (!str_equal(key, pattern)) break;
146  }
147 }
148 
150  char key[TOKENSIZE];
151  embl_key_word(reader.line(), 0, key);
153  parse_keyed_section(key);
154 }
155 
156 static void embl_origin(Seq& seq, Reader& reader) {
157  // Read in embl sequence data.
158  ca_assert(seq.is_empty());
159 
160  // read in whole sequence data
161  for (++reader;
162  reader.line() && !is_sequence_terminator(reader.line());
163  ++reader)
164  {
165  const char *line = reader.line();
166  for (int idx = 5; line[idx]; ++idx) {
167  char ch = line[idx];
168  if (ch == ' ' || ch == '\n') continue;
169  if (idx>70) continue;
170  seq.add(ch);
171  }
172  }
173 }
174 
// Dispatch on the line keyword 'key' and read the matching section into the
// 'embl' data via 'reader'.
// NOTE(review): the embedded listing numbers skip 228-229 and 232, so the
// bodies of the "SQ" branch and of the final fallback branch are not visible
// in this listing. Confirm against the original file before changing this code.
175 void EmblParser::parse_keyed_section(const char *key) {
176  if (str_equal(key, "ID")) {
177  embl_one_entry(reader, embl.ID, key);
178  }
179  else if (str_equal(key, "DT")) {
180  embl_date(embl, reader);
181  }
182  else if (str_equal(key, "DE")) {
183  embl_one_entry(reader, embl.description, key);
184  }
185  else if (str_equal(key, "OS")) {
186  embl_one_entry(reader, embl.os, key);
187  }
188  else if (str_equal(key, "AC")) {
189  embl_one_entry(reader, embl.accession, key);
190  }
191  else if (str_equal(key, "KW")) {
192  embl_one_entry(reader, embl.keywords, key);
193 
194  // correct missing '.'
195  if (!has_content(embl.keywords)) freedup(embl.keywords, ".\n");
196  else terminate_with(embl.keywords, '.');
197  }
198  else if (str_equal(key, "DR")) {
199  embl_one_entry(reader, embl.dr, key);
200  }
 // RA/RT/RL/RP all store into the reference returned by embl.get_latest_ref()
201  else if (str_equal(key, "RA")) {
202  Emblref& ref = embl.get_latest_ref();
203  embl_one_entry(reader, ref.author, key);
204  terminate_with(ref.author, ';');
205  }
206  else if (str_equal(key, "RT")) {
207  Emblref& ref = embl.get_latest_ref();
208  embl_one_entry(reader, ref.title, key);
209  embl_correct_title(ref);
210  }
211  else if (str_equal(key, "RL")) {
212  Emblref& ref = embl.get_latest_ref();
213  embl_one_entry(reader, ref.journal, key);
214  terminate_with(ref.journal, '.');
215  }
216  else if (str_equal(key, "RP")) {
217  Emblref& ref = embl.get_latest_ref();
218  embl_one_entry(reader, ref.processing, key);
219  }
 // RN grows the reference list by one and skips the line itself
220  else if (str_equal(key, "RN")) {
221  embl.resize_refs(embl.get_refcount()+1);
222  ++reader;
223  }
224  else if (str_equal(key, "CC")) {
225  embl_comments(embl, reader);
226  }
227  else if (str_equal(key, "SQ")) {
 // NOTE(review): listing lines 228-229 are missing here
230  }
231  else {
 // NOTE(review): listing line 232 is missing here
233  }
234 }
235 
236 void embl_key_word(const char *line, int index, char *key) {
237  parse_key_word(line+index, key, " \t\n");
238 }
239 
240 static void embl_print_lines(Writer& write, const char *key, const char *content, const WrapMode& wrapMode) {
241  // Print EMBL entry and wrap around if line over EMBLMAXLINE.
242  ca_assert(strlen(key) == 2);
243 
244  char prefix[TOKENSIZE];
245  sprintf(prefix, "%-*s", EMBLINDENT, key);
246 
247  wrapMode.print(write, prefix, prefix, content, EMBLMAXLINE);
248 }
249 
250 static bool embl_print_lines_if_content(Writer& write, const char *key, const char *content, const WrapMode& wrapMode, bool followed_by_spacer) {
251  if (has_content(content)) {
252  embl_print_lines(write, key, content, wrapMode);
253  if (followed_by_spacer) write.out("XX\n");
254  return true;
255  }
256  return false;
257 }
258 
259 static void embl_print_comment_if_content(Writer& write, const char *key, const char *content) {
260  // Print one embl comment line, wrap around
261 
262  if (!has_content(content)) return;
263 
264  char first[LINESIZE]; sprintf(first, "CC%*s%s", (EMBLINDENT-2)+RDP_SUBKEY_INDENT, "", key);
265  char other[LINESIZE]; sprintf(other, "CC%*s", (EMBLINDENT-2)+RDP_SUBKEY_INDENT+RDP_CONTINUED_INDENT, "");
266  WrapMode(true).print(write, first, other, content, EMBLMAXLINE);
267 }
268 
269 inline void embl_print_completeness(Writer& write, char compX, char X) {
270  if (compX == ' ') return;
271  ca_assert(compX == 'y' || compX == 'n');
272  write.outf("CC %c' end complete: %s\n", X, compX == 'y' ? "Yes" : "No");
273 }
274 
275 static void embl_out_comments(const Embl& embl, const Seq& seq, Writer& write) {
276  // Print out the comments part of EMBL format.
277 
278  const OrgInfo& orginf = embl.comments.orginf;
279  if (orginf.exists()) {
280  write.out("CC Organism information\n");
281 
282  embl_print_comment_if_content(write, "Source of strain: ", orginf.source);
283  embl_print_comment_if_content(write, "Culture collection: ", orginf.cultcoll);
284  embl_print_comment_if_content(write, "Former name: ", orginf.formname);
285  embl_print_comment_if_content(write, "Alternate name: ", orginf.nickname);
286  embl_print_comment_if_content(write, "Common name: ", orginf.commname);
287  embl_print_comment_if_content(write, "Host organism: ", orginf.hostorg);
288  }
289 
290  const SeqInfo& seqinf = embl.comments.seqinf;
291  if (seqinf.exists()) {
292  write.outf("CC Sequence information (bases 1 to %d)\n", seq.get_len());
293 
294  embl_print_comment_if_content(write, "RDP ID: ", seqinf.RDPid);
295  embl_print_comment_if_content(write, "Corresponding GenBank entry: ", seqinf.gbkentry);
296  embl_print_comment_if_content(write, "Sequencing methods: ", seqinf.methods);
297 
298  embl_print_completeness(write, seqinf.comp5, '5');
299  embl_print_completeness(write, seqinf.comp3, '3');
300  }
301 
302  embl_print_lines_if_content(write, "CC", embl.comments.others, WrapMode("\n"), true);
303 }
304 
305 static void embl_out_origin(const Seq& seq, Writer& write) {
306  // Print out the sequence data of EMBL format.
307  BaseCounts bases;
308  seq.count(bases);
309  write.outf("SQ Sequence %d BP; %d A; %d C; %d G; %d T; %d other;\n",
310  seq.get_len(), bases.a, bases.c, bases.g, bases.t, bases.other);
311 
312  seq.out(write, EMBL);
313 }
314 
315 void embl_out_header(const Embl& embl, const Seq& seq, Writer& write) {
316  WrapMode wrapWords(true);
317  WrapMode neverWrap(false);
318 
319  embl_print_lines_if_content(write, "ID", embl.ID, neverWrap, true);
320  embl_print_lines_if_content(write, "AC", embl.accession, wrapWords, true);
321 
322  {
323  bool dt1 = embl_print_lines_if_content(write, "DT", embl.dateu, neverWrap, false);
324  bool dt2 = embl_print_lines_if_content(write, "DT", embl.datec, neverWrap, false);
325  if (dt1 || dt2) write.out("XX\n");
326  }
327 
328  embl_print_lines_if_content(write, "DE", embl.description, wrapWords, true);
329  embl_print_lines_if_content(write, "KW", embl.keywords, WrapMode(";"), true);
330 
331  if (has_content(embl.os)) {
332  embl_print_lines(write, "OS", embl.os, wrapWords);
333  write.out("OC No information.\n");
334  write.out("XX\n");
335  }
336 
337  // GenbankRef
338  for (int indi = 0; indi < embl.get_refcount(); indi++) {
339  const Emblref& ref = embl.get_ref(indi);
340 
341  write.outf("RN [%d]\n", indi + 1);
342  embl_print_lines_if_content(write, "RP", ref.processing, neverWrap, false);
343  embl_print_lines_if_content(write, "RA", ref.author, WrapMode(","), false);
344 
345  if (has_content(ref.title)) embl_print_lines(write, "RT", ref.title, wrapWords);
346  else write.out("RT ;\n");
347 
348  embl_print_lines_if_content(write, "RL", ref.journal, wrapWords, false);
349  write.out("XX\n");
350  }
351 
352  if (has_content(embl.dr)) {
353  embl_print_lines(write, "DR", embl.dr, wrapWords);
354  write.out("XX\n");
355  }
356 
357  embl_out_comments(embl, seq, write);
358 }
359 
360 void embl_out(const Embl& embl, const Seq& seq, Writer& write) {
361  // Output EMBL data.
362  embl_out_header(embl, seq, write);
363  embl_out_origin(seq, write);
364 }
365 
366 static char *etog_author(char *Str) {
367  // Convert EMBL author format to Genbank author format.
368  int indi, indk, len, index;
369  char token[TOKENSIZE], *author;
370 
371  author = ARB_strdup("");
372  for (indi = index = 0, len = str0len(Str) - 1; indi < len; indi++, index++) {
373  if (Str[indi] == ',' || Str[indi] == ';') {
374  token[index--] = '\0';
375  if (has_content(author)) {
376  Append(author, (Str[indi] == ',') ? "," : " and");
377  }
378  // search backward to find the first blank and replace the blank by ','
379  for (indk = 0; index > 0 && indk == 0; index--)
380  if (token[index] == ' ') {
381  token[index] = ',';
382  indk = 1;
383  }
384  Append(author, token);
385  index = (-1);
386  }
387  else
388  token[index] = Str[indi];
389  }
390  Append(author, "\n");
391  return author;
392 }
393 static char *etog_journal(const char *eJournal) {
394  // Convert journal part from EMBL to GenBank format.
395  char *new_journal = NULp;
396  char token[TOKENSIZE];
397 
398  scan_token_or_die(token, eJournal);
399  if (str_equal(token, "(in)") == 1 || str_equal(token, "Submitted") || str_equal(token, "Unpublished")) {
400  // remove trailing '.'
401  int len = strlen(eJournal);
402  ca_assert(eJournal[len-2] == '.');
403  new_journal = strndup(eJournal, len-2);
404  Append(new_journal, "\n");
405  }
406  else {
407  const char *colon = strchr(eJournal, ':');
408 
409  if (colon) {
410  const char *p1 = strchr(colon+1, '(');
411  if (p1) {
412  const char *p2 = strchr(p1+1, ')');
413  if (p2 && strcmp(p2+1, ".\n") == 0) {
414  ARB_realloc(new_journal, str0len(eJournal)+1+1);
415 
416  int l1 = colon-eJournal;
417  int l2 = p1-colon-1;
418  int l3 = p2-p1+1;
419 
420  char *pos = new_journal;
421 
422  memcpy(pos, eJournal, l1); pos += l1;
423  memcpy(pos, ", ", 2); pos += 2;
424  memcpy(pos, colon+1, l2); pos += l2;
425  memcpy(pos, " ", 1); pos += 1;
426  memcpy(pos, p1, l3); pos += l3;
427  memcpy(pos, "\n", 2);
428  }
429  }
430  }
431 
432  if (!new_journal) {
433  warningf(148, "Removed unknown journal format: %s", eJournal);
434  new_journal = no_content();
435  }
436  }
437 
438  return new_journal;
439 }
440 static void etog_convert_references(const Embl& embl, GenBank& gbk) {
441  // Convert reference from EMBL to GenBank format.
442  int indi, len, start, end;
443  char temp[LONGTEXT];
444 
445  gbk.resize_refs(embl.get_refcount());
446 
447  for (indi = 0; indi < embl.get_refcount(); indi++) {
448  const Emblref& ref = embl.get_ref(indi);
449  GenbankRef& gref = gbk.get_ref(indi);
450 
451  if (has_content(ref.processing) &&
452  sscanf(ref.processing, "%d %d", &start, &end) == 2)
453  {
454  end *= -1; // will get negative from sscanf
455  sprintf(temp, "%d (bases %d to %d)\n", (indi + 1), start, end);
456  }
457  else {
458  sprintf(temp, "%d\n", (indi + 1));
459  }
460 
461  freedup(gref.ref, temp);
462 
463  if (has_content(ref.title) && ref.title[0] != ';') {
464  // remove '"' and ';', if there is any
465  len = str0len(ref.title);
466  if (len > 2 && ref.title[0] == '"' && ref.title[len - 2] == ';' && ref.title[len - 3] == '"') {
467  ref.title[len - 3] = '\n';
468  ref.title[len - 2] = '\0';
469  freedup(gref.title, ref.title+1);
470  ref.title[len - 3] = '"';
471  ref.title[len - 2] = ';';
472  }
473  else {
474  freedup(gref.title, ref.title);
475  }
476  }
477  else {
478  freeset(gref.title, no_content());
479  }
480 
481  freeset(gref.author, has_content(ref.author) ? etog_author(ref.author) : no_content());
482  freeset(gref.journal, has_content(ref.journal) ? etog_journal(ref.journal) : no_content());
483 
484  freeset(gref.standard, no_content());
485  }
486 }
487 
488 int etog(const Embl& embl, GenBank& gbk, const Seq& seq) { // __ATTR__USERESULT
489  // Convert from embl to genbank format.
490  int indi;
491  char key[TOKENSIZE], temp[LONGTEXT];
492  char t1[TOKENSIZE], t2[TOKENSIZE], t3[TOKENSIZE];
493 
494  embl_key_word(embl.ID, 0, key);
495  if (has_content(embl.dr)) {
496  // get short_id from DR line if there is RDP def.
497  strcpy(t3, "dummy");
498  ASSERT_RESULT(int, 3, sscanf(embl.dr, "%s %s %s", t1, t2, t3));
499  if (str_equal(t1, "RDP;")) {
500  if (!str_equal(t3, "dummy")) {
501  strcpy(key, t3);
502  }
503  else
504  strcpy(key, t2);
505  key[str0len(key) - 1] = '\0'; // remove '.'
506  }
507  }
508  strcpy(temp, key);
509 
510  // LOCUS
511  for (indi = str0len(temp); indi < 13; temp[indi++] = ' ') {}
512  {
513  const char *date = has_content(embl.dateu) ? embl.dateu : today_date();
514  sprintf((temp + 10), "%7d bp RNA RNA %s\n",
515  seq.get_len(),
516  genbank_date(date));
517  }
518  freedup(gbk.locus, temp);
519 
520  // DEFINITION
521  if (copy_content(gbk.definition, embl.description)) terminate_with(gbk.definition, '.');
522 
523  // SOURCE and DEFINITION if not yet defined
524  if (copy_content(gbk.source, embl.os)) {
525  freedup(gbk.organism, embl.os);
526  if (!has_content(embl.description)) {
527  freedup(gbk.definition, embl.os);
528  }
529  }
530 
531  // COMMENT GenBank entry
532  copy_content(gbk.accession, embl.accession);
533  if (has_content(embl.keywords) && embl.keywords[0] != '.') {
534  freedup(gbk.keywords, embl.keywords);
535  }
536 
537  etog_convert_references(embl, gbk);
538  gbk.comments.set_content_from(embl.comments);
539 
540  return 1;
541 }
542 
543 int etom(const Embl& embl, Macke& macke, const Seq& seq) { // __ATTR__USERESULT
544  // Convert from embl format to Macke format.
545  GenBank gbk;
546  return etog(embl, gbk, seq) && gtom(gbk, macke);
547 }
548 
549 // --------------------------------------------------------------------------------
550 
551 #ifdef UNIT_TESTS
552 #include <test_unit.h>
553 
// Expect that etog_journal() converts the EMBL journal string 'i' into 'o'.
#define TEST_EXPECT_ETOG_JOURNAL_PARSES(i,o)    \
    do {                                        \
        char *input  = ARB_strdup(i);           \
        char *parsed = etog_journal(input);     \
        TEST_EXPECT_EQUAL(parsed, o);           \
        free(parsed);                           \
        free(input);                            \
    } while (0)
562 
563 void TEST_BASIC_etog_journal() {
564  // behavior documented in r6943:
565  TEST_EXPECT_ETOG_JOURNAL_PARSES("Gene 134:283-287(1993).\n",
566  "Gene 134, 283-287 (1993)\n");
567  TEST_EXPECT_ETOG_JOURNAL_PARSES("J. Exp. Med. 179:1809-1821(1994).\n",
568  "J. Exp. Med. 179, 1809-1821 (1994)\n");
569  TEST_EXPECT_ETOG_JOURNAL_PARSES("Unpublished whatever.\n",
570  "Unpublished whatever\n");
571  TEST_EXPECT_ETOG_JOURNAL_PARSES("bla bla bla.\n",
572  "\n"); // skips if can't parse
573  TEST_EXPECT_ETOG_JOURNAL_PARSES("bla bla bla\n",
574  "\n");
575 }
576 
577 #endif // UNIT_TESTS
578 
579 static char *gtoe_author(char *author) {
580  // Convert GenBank author to EMBL author.
581  int indi, len, index, odd;
582  char *auth, *Str;
583 
584  // replace " and " by ", "
585  auth = nulldup(author);
586  if ((index = find_pattern(auth, " and ")) > 0) {
587  auth[index] = '\0';
588  Str = nulldup(auth);
589  auth[index] = ' '; // remove '\0' for free space later
590  Append(Str, ",");
591  Append(Str, auth + index + 4);
592  }
593  else
594  Str = nulldup(author);
595 
596  for (indi = 0, len = str0len(Str), odd = 1; indi < len; indi++) {
597  if (Str[indi] == ',') {
598  if (odd) {
599  Str[indi] = ' ';
600  odd = 0;
601  }
602  else {
603  odd = 1;
604  }
605  }
606  }
607 
608  freenull(auth);
609  return Str;
610 }
611 static char *gtoe_journal(char *Str) {
612  // Convert GenBank journal to EMBL journal.
613  char token[TOKENSIZE], *journal;
614  int indi, indj, index, len;
615 
616  if (scan_token(token, Str)) {
617  if (str_equal(token, "(in)") == 1 || str_equal(token, "Unpublished") || str_equal(token, "Submitted")) {
618  journal = nulldup(Str);
619  terminate_with(journal, '.');
620  return journal;
621  }
622  }
623 
624  journal = nulldup(Str);
625  for (indi = indj = index = 0, len = str0len(journal); indi < len; indi++, indj++) {
626  if (journal[indi] == ',') {
627  journal[indi] = ':';
628  indi++; // skip blank after ','
629  index = 1;
630  }
631  else if (journal[indi] == ' ' && index) {
632  indj--;
633  }
634  else
635  journal[indj] = journal[indi];
636  }
637 
638  journal[indj] = '\0';
639  terminate_with(journal, '.');
640  return journal;
641 }
642 static void gtoe_reference(const GenBank& gbk, Embl& embl) {
643  // Convert references from GenBank to EMBL.
644  if (gbk.has_refs()) {
645  embl.resize_refs(gbk.get_refcount());
646  }
647 
648  for (int indi = 0; indi < gbk.get_refcount(); indi++) {
649  Emblref& ref = embl.get_ref(indi);
650  const GenbankRef& gref = gbk.get_ref(indi);
651 
652  freedup(ref.title, gref.title);
653  embl_correct_title(ref);
654 
655  freeset(ref.journal, gtoe_journal(gref.journal));
656  terminate_with(ref.journal, '.');
657 
658  freeset(ref.author, gtoe_author(gref.author));
659  terminate_with(ref.author, ';');
660 
661  // create processing information
662  int refnum, start = 0, end = 0;
663  char t1[TOKENSIZE], t2[TOKENSIZE], t3[TOKENSIZE];
664 
665  if (!gref.ref || sscanf(gref.ref, "%d %s %d %s %d %s", &refnum, t1, &start, t2, &end, t3) != 6) {
666  start = 0;
667  end = 0;
668  }
669 
670  freenull(ref.processing);
671  if (start || end) ref.processing = strf("%d-%d\n", start, end);
672  else ref.processing = no_content();
673 
674  }
675 }
676 
677 int gtoe(const GenBank& gbk, Embl& embl, const Seq& seq) { // __ATTR__USERESULT
678  // Genbank to EMBL.
679  {
680  char temp[LONGTEXT];
681  strcpy(temp, gbk.get_id());
682 
683  upcase(temp); // Adjust short-id, EMBL short_id always upper case
684  for (int indi = min(str0len(temp), 9); indi < 10; indi++)
685  temp[indi] = ' ';
686 
687  sprintf(temp + 10, "preliminary; RNA; UNA; %d BP.\n", seq.get_len());
688  freedup(embl.ID, temp);
689  }
690 
691  // accession number
692  if (has_content(gbk.accession))
693  // take just the accession num, no version num.
694  freedup(embl.accession, gbk.accession);
695 
696  // date
697  {
698  char *date = gbk.get_date();
699 
700  freeset(embl.dateu, strf("%s (Rel. 1, Last updated, Version 1)\n", date));
701  freeset(embl.datec, strf("%s (Rel. 1, Created)\n", date));
702 
703  free(date);
704  }
705 
706  // description
707  copy_content(embl.description, gbk.definition);
708  // EMBL KW line
709  if (copy_content(embl.keywords, gbk.keywords)) {
710  terminate_with(embl.keywords, '.');
711  }
712  else {
713  freedup(embl.keywords, ".\n");
714  }
715 
716  copy_content(embl.os, gbk.organism); // EMBL OS line
717  // reference
718  gtoe_reference(gbk, embl);
719 
720  // EMBL DR line
721  {
722  char token[TOKENSIZE];
723  char temp[LONGTEXT];
724 
725  scan_token_or_die(token, gbk.locus); // short_id
726  if (has_content(gbk.comments.seqinf.RDPid)) {
727  char rdpid[TOKENSIZE];
728  scan_token_or_die(rdpid, gbk.comments.seqinf.RDPid);
729  sprintf(temp, "RDP; %s; %s.\n", rdpid, token);
730  }
731  else {
732  sprintf(temp, "RDP; %s.\n", token);
733  }
734  freedup(embl.dr, temp);
735  }
736  embl.comments.set_content_from(gbk.comments);
737 
738  return 1;
739 }
740 
741 static int partial_mtoe(const Macke& macke, Embl& embl) {
742  // Handle subspecies information when converting from Macke to EMBL.
743  char*& others = embl.comments.others;
744 
745  if (has_content(macke.strain)) {
746  int ridx = skip_pattern(others, "*source:");
747  bool have_strain = ridx >= 0 && stristr(others+ridx, "strain=");
748 
749  if (!have_strain) {
750  if (!has_content(others)) freenull(others);
751  Append(others, "*source: strain=");
752  Append(others, macke.strain);
753  if (!is_end_mark(others[str0len(others) - 2])) skip_eolnl_and_append(others, ";\n");
754  }
755  }
756 
757  if (has_content(macke.subspecies)) {
758  int ridx = skip_pattern(others, "*source:");
759  bool have_subsp = ridx >= 0 && find_subspecies(others+ridx, '=') >= 0;
760 
761  if (!have_subsp) {
762  if (!has_content(others)) freenull(others);
763  Append(others, "*source: subspecies=");
764  Append(others, macke.subspecies);
765  if (!is_end_mark(others[str0len(others) - 2])) skip_eolnl_and_append(others, ";\n");
766  }
767  }
768 
769  return 1;
770 }
771 
772 int mtoe(const Macke& macke, Embl& embl, const Seq& seq) { // __ATTR__USERESULT
773  GenBank gbk;
774  return mtog(macke, gbk, seq) && gtoe(gbk, embl, seq) && partial_mtoe(macke, embl);
775 }
776 
778  data.reinit();
779  if (!EmblParser(data, seq, *this).parse_entry()) abort();
780  return ok();
781 }
CONSTEXPR_INLINE int str0len(const char *str)
Definition: global.h:98
Definition: reader.h:21
int mtog(const Macke &macke, GenBank &gbk, const Seq &seq)
Definition: mg.cxx:414
char * nickname
Definition: rdp_info.h:18
#define p_nonkey_start
Definition: defs.h:33
int c
Definition: seq.h:22
char * formname
Definition: rdp_info.h:17
static void embl_origin(Seq &seq, Reader &reader)
Definition: embl.cxx:156
int gtom(const GenBank &gbk, Macke &macke)
Definition: mg.cxx:454
char * title
Definition: genbank.h:17
char * others
Definition: rdp_info.h:99
char * gbkentry
Definition: rdp_info.h:64
int t
Definition: seq.h:24
char * title
Definition: embl.h:16
static void etog_convert_references(const Embl &embl, GenBank &gbk)
Definition: embl.cxx:440
char * commname
Definition: rdp_info.h:19
void embl_out(const Embl &embl, const Seq &seq, Writer &write)
Definition: embl.cxx:360
void warningf(int warning_num, const char *warning_messagef,...) __ATTR__FORMAT(2)
Definition: util.cxx:66
void add(char c)
Definition: seq.h:98
static void embl_one_entry(Reader &reader, char *&entry, const char *key)
Definition: embl.cxx:26
const char * stristr(const char *str, const char *substring)
Definition: util.cxx:250
void abort()
Definition: reader.h:54
void(* RDP_comment_parser)(char *&datastring, int start_index, Reader &reader)
Definition: fun.h:49
int find_pattern(const char *text, const char *pattern)
Definition: util.cxx:241
#define ca_assert(cond)
Definition: global.h:33
void embl_print_completeness(Writer &write, char compX, char X)
Definition: embl.cxx:269
void skip_eolnl_and_append(char *&string1, const char *string2)
Definition: util.cxx:127
#define ASSERT_RESULT(Type, Expected, Expr)
Definition: arb_assert.h:336
char * ARB_strdup(const char *str)
Definition: arb_string.h:27
char * source
Definition: rdp_info.h:15
char * ref
Definition: genbank.h:15
EntryState state
Definition: parser.h:18
void warning(int warning_num, const char *warning_message)
Definition: util.cxx:61
char * strf(const char *format,...) __ATTR__FORMAT(1)
Definition: util.cxx:27
static void embl_print_comment_if_content(Writer &write, const char *key, const char *content)
Definition: embl.cxx:259
int mtoe(const Macke &macke, Embl &embl, const Seq &seq)
Definition: embl.cxx:772
Definition: wrap.h:4
static void embl_out_origin(const Seq &seq, Writer &write)
Definition: embl.cxx:305
bool read_one_entry(Seq &seq) OVERRIDE __ATTR__USERESULT
Definition: embl.cxx:777
void count(BaseCounts &counter) const
Definition: seq.h:115
char * author
Definition: embl.h:15
Definition: reader.h:95
static char * gtoe_journal(char *Str)
Definition: embl.cxx:611
static char * gtoe_author(char *author)
Definition: embl.cxx:579
const char * genbank_date(const char *other_date)
Definition: date.cxx:164
int etom(const Embl &embl, Macke &macke, const Seq &seq)
Definition: embl.cxx:543
FILE * seq
Definition: rns.c:46
CONSTEXPR_INLINE bool has_content(const char *field)
Definition: global.h:127
static HelixNrInfo * start
int comment_subkey(const char *line, char *key)
Definition: embl.cxx:83
static void gtoe_reference(const GenBank &gbk, Embl &embl)
Definition: embl.cxx:642
char * journal
Definition: genbank.h:18
static void embl_out_comments(const Embl &embl, const Seq &seq, Writer &write)
Definition: embl.cxx:275
void print(Writer &write, const char *first_prefix, const char *other_prefix, const char *content, int max_width) const
Definition: wrap.cxx:52
char comp5
Definition: rdp_info.h:61
bool copy_content(char *&entry, const char *content)
Definition: global.h:136
Seq & seq
Definition: parser.h:19
bool ok() const
Definition: reader.h:52
int find_subspecies(const char *str, char expect_behind)
Definition: util.cxx:244
static void embl_comments(Embl &embl, Reader &reader)
Definition: embl.cxx:118
static char * etog_author(char *Str)
Definition: embl.cxx:366
Definition: fun.h:12
Definition: seq.h:43
virtual void out(char ch)=0
static void embl_skip_unidentified(const char *pattern, Reader &reader)
Definition: embl.cxx:138
int other
Definition: seq.h:25
void skip_eolnl_and_append_spaced(char *&string1, const char *string2)
Definition: util.cxx:135
CONSTEXPR_INLINE bool is_end_mark(char ch)
Definition: global.h:113
virtual int outf(const char *format,...) __ATTR__FORMAT_MEMBER(1)
Definition: reader.cxx:121
bool is_embl_comment(const char *line)
Definition: embl.cxx:95
int gtoe(const GenBank &gbk, Embl &embl, const Seq &seq)
Definition: embl.cxx:677
static bool embl_print_lines_if_content(Writer &write, const char *key, const char *content, const WrapMode &wrapMode, bool followed_by_spacer)
Definition: embl.cxx:250
Definition: embl.h:14
char comp3
Definition: rdp_info.h:60
int a
Definition: seq.h:21
Reader & reader
Definition: parser.h:20
void embl_key_word(const char *line, int index, char *key)
Definition: embl.cxx:236
void scan_token_or_die(char *to, const char *from)
Definition: util.cxx:14
int Skip_white_space(const char *line, int index)
Definition: util.cxx:84
int get_len() const
Definition: seq.h:107
bool is_empty() const
Definition: seq.h:108
char * journal
Definition: embl.h:17
char * standard
Definition: genbank.h:19
void embl_out_header(const Embl &embl, const Seq &seq, Writer &write)
Definition: embl.cxx:315
static int partial_mtoe(const Macke &macke, Embl &embl)
Definition: embl.cxx:741
Definition: seq.h:20
#define EMBLINDENT
Definition: defs.h:26
char * hostorg
Definition: rdp_info.h:20
#define EMBLMAXLINE
Definition: defs.h:21
char * RDPid
Definition: rdp_info.h:63
void parse_section() OVERRIDE
Definition: embl.cxx:149
static void embl_correct_title(Emblref &ref)
Definition: embl.cxx:59
const char * today_date()
Definition: date.cxx:214
void Append(char *&string1, const char *string2)
Definition: util.cxx:141
static void embl_date(Embl &embl, Reader &reader)
Definition: embl.cxx:33
void ARB_realloc(TYPE *&tgt, size_t nelem)
Definition: arb_mem.h:43
void upcase(char *str)
Definition: util.cxx:149
bool scan_token(char *to, const char *from) __ATTR__USERESULT
Definition: util.cxx:10
#define LONGTEXT
Definition: defs.h:17
CONSTEXPR_INLINE bool str_equal(const char *s1, const char *s2)
Definition: global.h:95
bool exists() const
Definition: rdp_info.h:48
bool parse_RDP_comment(RDP_comments &comments, RDP_comment_parser one_comment_entry, const char *key, int index, Reader &reader)
Definition: rdp_info.cxx:12
#define RDP_CONTINUED_INDENT
Definition: defs.h:31
char * processing
Definition: embl.h:18
static void embl_one_comment_entry(char *&datastring, int start_index, Reader &reader)
Definition: embl.cxx:97
static int pattern[maxsites+1]
static char * etog_journal(const char *eJournal)
Definition: embl.cxx:393
#define TOKENSIZE
Definition: defs.h:18
char * author
Definition: genbank.h:16
const char * line() const
Definition: reader.h:43
static int line
Definition: arb_a2ps.c:296
#define NULp
Definition: cxxforward.h:116
CONSTEXPR_INLINE int count_spaces(const char *str)
Definition: global.h:109
#define RDP_SUBKEY_INDENT
Definition: defs.h:30
int parse_key_word(const char *line, char *key, const char *separator)
Definition: util.cxx:265
char * cultcoll
Definition: rdp_info.h:16
char * no_content()
Definition: global.h:129
#define offset(field)
Definition: GLwDrawA.c:73
char * methods
Definition: rdp_info.h:65
int g
Definition: seq.h:23
char * strndup(const char *str, int len)
Definition: global.h:102
int skip_pattern(const char *text, const char *pattern)
Definition: util.cxx:242
CONSTEXPR_INLINE bool is_sequence_terminator(const char *str)
Definition: global.h:115
bool exists() const
Definition: rdp_info.h:86
void out(Writer &write, Format outType) const
Definition: seq.cxx:12
#define LINESIZE
Definition: defs.h:16
#define min(a, b)
Definition: f2c.h:153
void terminate_with(char *&str, char ch)
Definition: util.cxx:110
static void embl_continue_line(const char *pattern, char *&Str, Reader &reader)
Definition: embl.cxx:6
int etog(const Embl &embl, GenBank &gbk, const Seq &seq)
Definition: embl.cxx:488
static void embl_print_lines(Writer &write, const char *key, const char *content, const WrapMode &wrapMode)
Definition: embl.cxx:240