ARB
Importer.cxx
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : Importer.cxx //
4 // Purpose : Genome importer core //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in November 2006 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================ //
11 
12 #include "tools.h"
13 #include "DBwriter.h"
14 #include <arbdb.h>
15 #include <arb_stdstr.h>
16 
17 using namespace std;
18 
19 // --------------------------------------------------------------------------------
20 
static bool is_escaped(const std::string& str, size_t pos) {
    // Returns true if the character at position 'pos' of 'str' is escaped by '\\'.
    //
    // A character is escaped if it is directly preceded by an odd number of
    // consecutive backslashes (in an even-length run every backslash is itself
    // escaped or escaping another backslash). Position 0 can never be escaped.
    //
    // The run of backslashes is counted iteratively: recursing once per
    // backslash needs stack depth proportional to the length of the run.

    size_t backslashes = 0;
    while (backslashes < pos && str[pos-1-backslashes] == '\\') {
        ++backslashes;
    }
    return (backslashes % 2) == 1;
}
32 
34  // start parsing at position 5
35  string::size_type first_char = line.find_first_not_of(' ', 5);
36 
37  orgLine = line;
38 
39  if (first_char == 5) { // feature start
40  string::size_type behind_name = line.find_first_of(' ', first_char);
41  string::size_type rest_start = line.find_first_not_of(' ', behind_name);
42 
43  if (rest_start == string::npos) {
44  if (behind_name == string::npos) throw "Expected space behind feature name";
45  throw "Expected some content behind feature name";
46  }
47 
48  name = line.substr(first_char, behind_name-first_char);
49  rest = line.substr(rest_start);
50  type = FL_START;
51  }
52  else if (first_char >= 21) { // not feature start
53  if (first_char == 21 && line[first_char] == '/') { // qualifier start
54  string::size_type equal_pos = line.find_first_of('=', first_char);
55  if (equal_pos == string::npos) {
56  // qualifier w/o data (i.e. "/pseudo")
57  name = line.substr(first_char+1);
58  rest = "true";
60  }
61  else {
62  name = line.substr(first_char+1, equal_pos-first_char-1);
63  rest = line.substr(equal_pos+1);
64 
65  if (rest[0] == '"') {
66  size_t rlen = rest.length();
67 
68  if (rlen == 1) { // special case: only one open quote behind qualifier
70  }
71  else if (rest[rlen-1] == '"' && !is_escaped(rest, rlen-1)) { // closing non-escaped quote at eol
73  }
74  else {
76  }
77  }
78  else {
80  }
81  }
82  }
83  else { // continued line
84  interpret_as_continued_line();
85  }
86  }
87  else {
88  if (first_char == string::npos) {
89  throw "Expected feature line, found empty line";
90  }
91  throw GBS_global_string("Expected feature line (first char at pos=%zu unexpected)", first_char);
92  }
93 }
94 
95 void FeatureLine::interpret_as_continued_line() {
96  rest = orgLine.substr(21);
97  if (rest[rest.length()-1] == '"') {
99  }
100  else {
101  type = FL_CONTINUED;
102  }
103 }
104 
106  bool ok = false;
107 
108  if (type == FL_QUALIFIER || type == FL_QUALIFIER_NODATA) {
109  string::size_type first_char = orgLine.find_first_not_of(' ', 5);
110  if (first_char >= 21) {
111  interpret_as_continued_line();
112  ok = true;
113  }
114  }
115 
116  return ok;
117 }
118 
119 // --------------------------------------------------------------------------------
120 
121 Importer::Importer(LineReader& Flatfile, DBwriter& DB_writer, const MetaTag *meta_description)
122  : db_writer(DB_writer),
123  flatfile(Flatfile),
124  tagTranslator(meta_description),
125  expectedSeqLength(-1)
126 {}
127 
128 void Importer::warning(const char *msg) {
129  warnings.push_back(msg);
130 }
131 
133  FeatureLinePtr fline;
134 
135  if (pushedFeatureLines.empty()) { // nothing on stack -> read new
136  string line;
137  if (readFeatureTableLine(line)) fline = new FeatureLine(line);
138  }
139  else {
140  fline = pushedFeatureLines.back();
141  pushedFeatureLines.pop_back();
142  }
143  return fline;
144 }
145 
148  if (!fline.isNull()) {
149  if (fline->type & FL_META_CONTINUED) throw "Expected start of feature or qualifier";
150 
151  if (0 == (fline->type & (FL_QUALIFIER_NODATA|FL_QUALIFIER_QUOTED))) {
152  // qualifier/featurestart may be wrapped
153  FeatureLinePtr next_fline = getFeatureTableLine();
154 
155  while (!next_fline.isNull() &&
156  fline->type != FL_QUALIFIER_QUOTED) // already seen closing quote
157  {
158  if ((next_fline->type&FL_META_CONTINUED) == 0) {
159  // special case: a wrapped line of a quoted qualifier may start with /xxx
160  // (in that case it is misinterpreted as qualifier start)
161  if (fline->type == FL_QUALIFIER_QUOTE_OPENED) {
162  if (!next_fline->reinterpret_as_continued_line()) {
163  throw "did not see end of quoted qualifier (instead found next qualifiert)";
164  }
165  gi_assert(next_fline->type & FL_META_CONTINUED);
166  }
167  else {
168  break;
169  }
170  }
171 
172  if (next_fline->type == FL_CONTINUED_QUOTE_CLOSED) {
173  if (fline->type != FL_QUALIFIER_QUOTE_OPENED) throw "Unexpected closing quote";
174  fline->type = FL_QUALIFIER_QUOTED;
175  }
176  else {
177  gi_assert(next_fline->type == FL_CONTINUED);
178  gi_assert(fline->type == FL_START || fline->type == FL_QUALIFIER || fline->type == FL_QUALIFIER_QUOTE_OPENED);
179  }
180 
181  fline->rest.append(next_fline->rest);
182  next_fline = getFeatureTableLine();
183  }
184 
185  if (!next_fline.isNull()) backFeatureTableLine(next_fline);
186  }
187  }
188  return fline;
189 }
190 
192  FeaturePtr feature;
194 
195  if (!fline.isNull()) { // found a feature table line
196  if (fline->type != FL_START) throw "Expected feature start";
197 
198  feature = new Feature(fline->name, fline->rest);
199 
201  while (!fline.isNull() && (fline->type & FL_META_QUALIFIER)) {
202  feature->addQualifiedEntry(fline->name, fline->rest);
204  }
205  if (!fline.isNull()) backFeatureTableLine(fline);
206  }
207 
208  return feature;
209 }
210 
212  FeaturePtr feature = parseFeature();
213 
214  while (!feature.isNull()) {
215  feature->expectLocationInSequence(expectedSeqLength);
216  feature->fixEmptyQualifiers();
218  feature = parseFeature();
219  }
220 }
221 
222 void Importer::show_warnings(const string& import_of_what) {
223  if (!warnings.empty()) {
224  const char *what = import_of_what.c_str();
225  stringVectorCRIter e = warnings.rend();
226  for (stringVectorCRIter i = warnings.rbegin(); i != e; ++i) {
227  GB_warningf("Warning: %s: %s", what, i->c_str());
228  }
229  warnings.clear();
230  }
231 }
232 
233 
235  try {
236  string line;
237  while (flatfile.getLine(line)) {
238  if (!line.empty()) { // silently skip empty lines before or after section
239  flatfile.backLine(line);
240 
241  // cleanup from import of previous section
242  gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
243  pushedFeatureLines.clear();
244  warnings.clear();
245 
246  expectedSeqLength = 0; // reset expected seq. length
247  import_section();
248 
249  gi_assert(warnings.empty());
250  gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
251  }
252  }
253  }
254  catch (const DBerror& err) { throw err.getMessage(); }
255  catch (const string& err) { throw flatfile.lineError(err); }
256  catch (const char *err) { throw flatfile.lineError(err); }
257 }
258 
259 void Importer::check_base_counters(const SequenceBuffer& seqData, const BaseCounter *headerCount) {
260  const BaseCounter& baseCounter = seqData.getBaseCounter();
261  if (baseCounter.getCount(BC_ALL)<1) {
262  warning("Sequence data is empty (only metadata found).");
263  }
264  if (!headerCount) {
265  gi_assert(dynamic_cast<GenebankImporter*>(this) != NULp); // this case shall only happen with genebank files.
266  warning("No 'BASE COUNT' found. Base counts have not been validated.");
267  }
268  else {
269  headerCount->expectEqual(baseCounter);
270  }
271 }
272 
273 // --------------------------------------------------------------------------------
274 // Meta information definitions
275 //
276 //
277 // [ please keep the list of common entries in
278 // ../HELP_SOURCE/source/sp_info.hlp@"Commonly used database entries"
279 // up to date! ]
280 
282  { "LOCUS", "org_locus", MT_HEADER },
283 
284  { "REFERENCE", "", MT_REF_START },
285  { " AUTHORS", "author", MT_REF },
286  { " TITLE", "title", MT_REF },
287  { " CONSRTM", "refgrp", MT_REF },
288  { " JOURNAL", "journal", MT_REF },
289  { " PUBMED", "pubmed_id", MT_REF },
290  { " MEDLINE", "medline_id", MT_REF },
291  { " REMARK", "refremark", MT_REF },
292 
293  { "DEFINITION", "definition", MT_BASIC },
294  { "ACCESSION", "acc", MT_BASIC },
295  { "VERSION", "version", MT_BASIC },
296  { "DBLINK", "db_xref", MT_BASIC },
297  { "KEYWORDS", "keywd", MT_BASIC },
298  { "SOURCE", "full_name", MT_BASIC },
299  { " ORGANISM", "tax", MT_BASIC },
300  { "COMMENT", "comment", MT_BASIC },
301  { "PROJECT", "projref", MT_BASIC },
302 
303  { "FEATURES", "", MT_FEATURE_START },
304  { "CONTIG", "", MT_CONTIG },
305  { "BASE", "", MT_SEQUENCE_START }, // BASE COUNT (sometimes missing)
306  { "ORIGIN", "", MT_SEQUENCE_START }, // only used if BASE COUNT is missing
307  { "//", "", MT_END },
308 
309  { "", "", MT_IGNORE }, // End of array
310 };
311 
313  { "ID", "org_id", MT_HEADER },
314 
315  { "RN", "", MT_REF_START },
316  { "RA", "author", MT_REF },
317  { "RC", "auth_comm", MT_REF },
318  { "RG", "refgrp", MT_REF },
319  { "RL", "journal", MT_REF },
320  { "RP", "nuc_rp", MT_REF },
321  { "RT", "title", MT_REF },
322  { "RX", "", MT_REF_DBID }, // @@@ extract field 'pubmed_id' ?
323 
324  { "AC", "acc", MT_BASIC },
325  { "AH", "assembly_header", MT_BASIC },
326  { "AS", "assembly_info", MT_BASIC },
327  { "CC", "comment", MT_BASIC },
328  { "CO", "contig", MT_BASIC },
329  { "DE", "description", MT_BASIC },
330  { "DR", "db_xref", MT_BASIC },
331  { "DT", "date", MT_BASIC },
332  { "SV", "version", MT_BASIC },
333  { "KW", "keywd", MT_BASIC },
334  { "OS", "full_name", MT_BASIC },
335  { "OC", "tax", MT_BASIC },
336  { "OG", "organelle", MT_BASIC },
337  { "PR", "projref", MT_BASIC },
338 
339  { "FH", "", MT_FEATURE_START },
340  { "FT", "", MT_FEATURE },
341  { "SQ", "", MT_SEQUENCE_START },
342  { "//", "", MT_END },
343 
344  { "XX", "", MT_IGNORE }, // spacer
345 
346  { "", "", MT_IGNORE }, // End of array
347 };
348 
349 // --------------------------------------------------------------------------------
350 
351 
353  : Importer(Flatfile, DB_writer, genebank_meta_description)
354 {}
355 
356 bool GenebankImporter::readFeatureTableLine(string& line) {
357  if (flatfile.getLine(line)) {
358  if (beginsWith(line, " ")) {
359  return true;
360  }
361  flatfile.backLine(line);
362  }
363  return false;
364 }
365 
static bool splitGenebankTag(const std::string& line, std::string& tag, std::string& content) {
    // Splits 'line' into 'tag' (incl. preceding spaces) and 'content'.
    //
    // - If the first 12 columns contain no non-space character, the line is a
    //   wrapped line: 'tag' is set to "" and 'content' to everything from
    //   column 12 on ("" if the line is shorter than that).
    // - Otherwise 'tag' is the line up to the first space behind the tag and
    //   'content' starts at the first non-space behind that ("" if there is none).
    //
    // Returns true if the line suffices the format requirements
    // (currently never returns false!)

    const std::string::size_type CONTENT_COLUMN  = 12;
    const std::string::size_type first_non_space = line.find_first_not_of(' ');

    if (first_non_space == std::string::npos || first_non_space >= CONTENT_COLUMN) { // no tag, only content
        tag = "";
        // A blank line may be shorter than CONTENT_COLUMN;
        // substr(CONTENT_COLUMN) would throw std::out_of_range in that case.
        content = line.length() > CONTENT_COLUMN ? line.substr(CONTENT_COLUMN) : std::string();
        return true;
    }

    // from here on: first_non_space < CONTENT_COLUMN (line starts with a tag)

    const std::string::size_type behind_tag = line.find_first_of(' ', first_non_space);
    if (behind_tag == std::string::npos) { // only tag w/o spaces behind
        tag     = line;
        content = "";
        return true;
    }

    const std::string::size_type content_start = line.find_first_not_of(' ', behind_tag);
    if (content_start == std::string::npos) { // line w/o content
        content = "";
    }
    else {
        content = line.substr(content_start);
    }

    tag = line.substr(0, behind_tag);
    return true;
}
400 
401 static long scanSeqlenFromLOCUS(const string& locusContent) {
402  StringParser parser(locusContent);
403  parser.extractWord(); // id
404  parser.eatSpaces();
405 
406  long bp = parser.extractNumber();
407  parser.eatSpaces();
408  parser.expectContent("bp");
409 
410  return bp;
411 }
412 
413 void GenebankImporter::import_section() {
414  MetaInfo meta;
415  References refs;
416 
417  const MetaTag *prevTag = NULp; // previously handled tag
418  string prevContent; // previously found content
419 
420  bool seenHeaderLine = false;
421  bool EOS = false; // end of section ?
422 
423  // read header of file
424  while (!EOS) {
425  string line, tag, content;
426  expectLine(line);
427  if (!splitGenebankTag(line, tag, content)) {
428  gi_assert(0);
429  }
430 
431  if (tag.empty()) { // no tag - happens at wrapped lines
432  if (!content.empty()) { // do not append empty 'lines' from wrapped tag-entries.
433  prevContent.append(1, ' ');
434  prevContent.append(content);
435  }
436  }
437  else { // start of new tag
438  const MetaTag *knownTag = findTag(tag);
439  if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
440 
441  if (prevTag) { // save previous tag
442  switch (prevTag->type) {
443  case MT_REF: refs.add(prevTag->field, prevContent); break;
444  case MT_BASIC: meta.add(prevTag, prevContent, true); break;
445  case MT_HEADER:
446  meta.add(prevTag, prevContent, true); // save header line
447  expectedSeqLength = scanSeqlenFromLOCUS(prevContent);
448  break;
449  case MT_REF_DBID: // embl only
450  default: gi_assert(0); break;
451  }
452  prevTag = NULp;
453  }
454 
455  switch (knownTag->type) {
456  case MT_HEADER:
457  if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
458  seenHeaderLine = true;
459  // fall-through
460  case MT_BASIC:
461  case MT_REF:
462  prevTag = knownTag;
463  prevContent = content;
464  break;
465 
466  case MT_REF_START:
467  refs.start(); // start a new reference
468  break;
469 
470  case MT_FEATURE_START:
473  break;
474 
475  case MT_SEQUENCE_START:
476  parseSequence(knownTag->tag, content);
477  EOS = true; // end of section
478  break;
479 
480  case MT_IGNORE:
481  break;
482 
483  case MT_END:
484  EOS = true;
485  break;
486 
487  case MT_CONTIG:
488  throw GBS_global_string("Cannot import files containing CONTIG");
489 
490  case MT_REF_DBID: // embl only
491  default:
492  gi_assert(0);
493  throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
494  }
495  }
496  }
497 
498  db_writer.finalizeOrganism(meta, refs, *this);
500 }
501 
502 // --------------------------------------------------------------------------------
503 
504 
506  : Importer(Flatfile, DB_writer, embl_meta_description)
507 {}
508 
static bool splitEmblTag(const std::string& line, std::string& tag, std::string& content) {
    // Splits 'line' into a 2-character 'tag' and 'content'.
    //
    // Accepted formats:
    // - a line consisting of the tag only (length 2)   -> content = ""
    // - tag, 3 blanks (pos 2-4), content (from pos 5)  -> content may be ""
    //
    // Returns true on success (i.e. if line suffices the required format).
    // On failure 'tag' and 'content' are left untouched.

    if (line.length() == 2) {
        tag     = line;
        content = "";
        return true;
    }

    // Demand the complete spacer at pos 2-4. Matching fewer blanks would accept
    // lines whose content starts before pos 5 (cutting off its first characters)
    // and lines shorter than 5 characters (where substr(5) throws std::out_of_range).
    const std::string::size_type spacer = line.find("   "); // separator between tag and content
    if (spacer != 2) return false;

    tag     = line.substr(0, 2);
    content = line.substr(5);
    return true;
}
527 
528 bool EmblImporter::readFeatureTableLine(string& line) {
529  if (flatfile.getLine(line)) {
530  if (beginsWith(line, "FT ")) {
531  return true;
532  }
533  flatfile.backLine(line);
534  }
535  return false;
536 }
537 
538 static long scanSeqlenFromID(const string& idContent) {
539  StringParser parser(idContent);
540  string lastWord = parser.extractWord(); // eat id
541  bool bpseen = false;
542  long bp = -1;
543 
544  while (!bpseen) {
545  parser.eatSpaces();
546  string word = parser.extractWord();
547  if (word == "BP.") {
548  // basecount is in word before "BP."
549  bp = atol(lastWord.c_str());
550  bpseen = true;
551  }
552  else {
553  lastWord = word;
554  }
555  }
556 
557  if (bp == -1) throw "Could not parse bp from header";
558 
559  return bp;
560 }
561 
562 void EmblImporter::import_section() {
563  MetaInfo meta;
564  References refs;
565 
566  const MetaTag *prevTag = NULp; // previously handled tag
567  string prevContent; // previously found content
568  bool prevAppendNL = false; // append '\n' into multiline tags
569 
570  bool seenHeaderLine = false;
571  bool EOS = false; // end of section ?
572 
573  // read header of file
574  while (!EOS) {
575  string line, tag, content;
576  expectLine(line);
577  if (!splitEmblTag(line, tag, content)) {
578  throw "Expected two-character tag at start of line";
579  }
580 
581  const MetaTag *knownTag = findTag(tag);
582  if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
583 
584  if (knownTag == prevTag) { // multiline tag
585  if (prevAppendNL) prevContent.append("\n"); // append a newline to make parsing in add_dbid() more easy
586  prevContent.append(content); // append w/o space - EMBL flatfiles have spaces at EOL when needed
587  }
588  else { // start of new tag
589  if (prevTag) { // save previous tag
590  switch (prevTag->type) {
591  case MT_REF: refs.add(prevTag->field, prevContent); break;
592  case MT_REF_DBID: refs.add_dbid(prevContent); prevAppendNL = false; break;
593  case MT_BASIC: meta.add(prevTag, prevContent, true); break;
594  case MT_HEADER:
595  meta.add(prevTag, prevContent, true);
596  expectedSeqLength = scanSeqlenFromID(prevContent);
597  break;
598  default: gi_assert(0); break;
599  }
600  prevTag = NULp;
601  }
602 
603  switch (knownTag->type) {
604  case MT_HEADER:
605  if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
606  seenHeaderLine = true;
607  // fall-through
608  case MT_BASIC:
609  case MT_REF:
610  prevTag = knownTag;
611  prevContent = content;
612  break;
613 
614  case MT_REF_DBID:
615  prevTag = knownTag;
616  prevContent = content;
617  prevAppendNL = true;
618  break;
619 
620  case MT_REF_START:
621  refs.start(); // start a new reference
622  break;
623 
624  case MT_FEATURE:
625  flatfile.backLine(line);
628  break;
629 
630  case MT_SEQUENCE_START:
631  parseSequence(content);
632  EOS = true; // end of section
633  break;
634 
635  case MT_FEATURE_START:
636  case MT_IGNORE:
637  break;
638 
639  default:
640  gi_assert(0);
641  throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
642  }
643  }
644  }
645  db_writer.finalizeOrganism(meta, refs, *this);
647 }
648 
649 // --------------------------------------------------------------------------------
650 // sequence readers:
651 
652 inline bool parseCounter(bool expect, BaseCounter& headerCount, StringParser& parser, Base base, const char *word) {
653  // parses part of string (e.g. " 6021225 BP;" or " 878196 A;")
654  // if 'expect' == true -> throw exception if missing
655  // if 'expect' == false -> return false if missing
656 
657  bool found = false;
658  stringCIter start = parser.getPosition();
659 
660  parser.expectSpaces(0);
661 
662  bool seen_number;
663  long count = parser.eatNumber(seen_number);
664 
665  if (seen_number) {
666  headerCount.addCount(base, count);
667  size_t spaces = parser.eatSpaces();
668  if (spaces>0) {
669  size_t len = parser.lookingAt(word);
670  if (len>0) { // seen
671  parser.advance(len);
672  found = true;
673  }
674  }
675  }
676 
677  if (!found) {
678  parser.setPosition(start); // reset position
679  if (expect) throw GBS_global_string("Expected counter '### %s', found '%s'", word, parser.rest().c_str());
680  }
681  return found;
682 }
683 
684 void GenebankImporter::parseSequence(const string& tag, const string& headerline) {
685  SmartPtr<BaseCounter> headerCount;
686 
687  if (tag == "BASE") { // base count not always present
688  // parse headerline :
689  headerCount = new BaseCounter("sequence header");
690  {
691  StringParser parser(headerline);
692 
693  parser.expectContent("COUNT");
694 
695  parseCounter(true, *headerCount, parser, BC_A, "a");
696  parseCounter(true, *headerCount, parser, BC_C, "c");
697  parseCounter(true, *headerCount, parser, BC_G, "g");
698  parseCounter(true, *headerCount, parser, BC_T, "t");
699  parseCounter(false, *headerCount, parser, BC_OTHER, "others"); // not always present
700 
701  headerCount->calcOverallCounter();
702  }
703  }
704 
705  // parse sequence data
706  size_t est_seq_size = headerCount.isNull() ? 500000 : headerCount->getCount(BC_ALL);
707  SequenceBuffer seqData(est_seq_size);
708  {
709  string line;
710 
711  if (!headerCount.isNull()) {
712  // if BASE COUNT was present, check ORIGIN line
713  // otherwise ORIGIN line has already been read
714  expectLine(line);
715  if (!beginsWith(line, "ORIGIN")) throw "Expected 'ORIGIN'";
716  }
717 
718  bool eos_seen = false;
719  while (!eos_seen) {
720  expectLine(line);
721  if (beginsWith(line, "//")) {
722  eos_seen = true;
723  }
724  else {
725  string data;
726  data.reserve(60);
727  StringParser parser(line);
728 
729  parser.eatSpaces(); // not sure whether there really have to be spaces if number has 9 digits or more
730  size_t cur_pos = (size_t)parser.extractNumber();
731  size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
732 
733  if (cur_pos != (datasize+1)) {
734  throw GBS_global_string("Got wrong base position (found=%zu, expected=%zu)", cur_pos, size_t(datasize+1));
735  }
736 
737  int blocks = 0;
738  while (!parser.atEnd() && parser.at() == ' ') {
739  parser.expectSpaces(1);
740 
741  stringCIter start = parser.pos;
742  stringCIter end = parser.find(' ');
743 
744  data.append(start, end);
745  blocks++;
746  }
747 
748  if (blocks>6) throw "Found more than 6 parts of sequence data";
749  seqData.addLine(data);
750  }
751  }
752  }
753 
754  check_base_counters(seqData, headerCount.content());
755  db_writer.writeSequence(seqData);
756 }
757 
758 void EmblImporter::parseSequence(const string& headerline) {
759  // parse headerline:
760  BaseCounter headerCount("sequence header");
761  {
762  StringParser parser(headerline);
763 
764  parser.expectContent("Sequence");
765 
766  parseCounter(true, headerCount, parser, BC_ALL, "BP;");
767  parseCounter(true, headerCount, parser, BC_A, "A;");
768  parseCounter(true, headerCount, parser, BC_C, "C;");
769  parseCounter(true, headerCount, parser, BC_G, "G;");
770  parseCounter(true, headerCount, parser, BC_T, "T;");
771  parseCounter(true, headerCount, parser, BC_OTHER, "other;");
772 
773  headerCount.checkOverallCounter();
774  }
775 
776  // parse sequence data
777  SequenceBuffer seqData(headerCount.getCount(BC_ALL));
778  {
779  bool eos_seen = false;
780  string line;
781 
782  while (!eos_seen) {
783  expectLine(line);
784  if (beginsWith(line, "//")) {
785  eos_seen = true;
786  }
787  else {
788  string data;
789  data.reserve(60);
790  StringParser parser(line);
791 
792  parser.expectSpaces(5, false);
793  int blocks = 0;
794  while (!parser.atEnd() && isalpha(parser.at())) {
795  stringCIter start = parser.pos;
796  stringCIter end = parser.find(' ');
797 
798  data.append(start, end);
799  blocks++;
800  parser.expectSpaces(1);
801  }
802 
803  if (blocks>6) throw "Found more than 6 parts of sequence data";
804 
805  size_t basecount = (size_t)parser.extractNumber();
806 
807  seqData.addLine(data);
808  size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
809 
810  if (basecount != datasize) {
811  throw GBS_global_string("Got wrong base counter(found=%zu, expected=%zu)", basecount, datasize);
812  }
813  }
814  }
815  }
816 
817  check_base_counters(seqData, &headerCount);
818  db_writer.writeSequence(seqData);
819 }
820 
void backFeatureTableLine(FeatureLinePtr &fline)
Definition: Importer.h:73
void calcOverallCounter()
void show_warnings(const string &import_of_what)
Definition: Importer.cxx:222
GB_TYPES type
void check_base_counters(const SequenceBuffer &seqData, const BaseCounter *headerCount)
Definition: Importer.cxx:259
void writeSequence(const SequenceBuffer &seqData)
Definition: DBwriter.cxx:198
FeatureLine(const string &line)
Definition: Importer.cxx:33
bool getLine(string &line)
void add(const MetaTag *meta, const std::string &content, bool allow_multiple_entries)
Definition: MetaInfo.cxx:173
void writeFeature(const Feature &feature, long seqLength)
Definition: DBwriter.cxx:149
#define gi_assert(cond)
Definition: defs.h:26
bool parseCounter(bool expect, BaseCounter &headerCount, StringParser &parser, Base base, const char *word)
Definition: Importer.cxx:652
static MetaTag embl_meta_description[]
Definition: Importer.cxx:312
void expectLine(string &line)
Definition: Importer.h:67
virtual bool readFeatureTableLine(string &line)=0
void add_dbid(const std::string &content)
Definition: MetaInfo.cxx:122
size_t lookingAt(const char *content)
Definition: tools.h:79
static bool splitGenebankTag(const string &line, string &tag, string &content)
Definition: Importer.cxx:366
GenebankImporter(LineReader &Flatfile, DBwriter &DB_writer)
Definition: Importer.cxx:352
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203
STL namespace.
std::string tag
Definition: MetaTag.h:33
void expectEqual(const BaseCounter &other) const
size_t getCount(Base base) const
void expectContent(const char *content)
Definition: tools.h:93
Importer(LineReader &Flatfile, DBwriter &DB_writer, const MetaTag *meta_description)
Definition: Importer.cxx:121
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248
void finalizeOrganism(const MetaInfo &meta, const References &refs, Importer &importer)
Definition: DBwriter.cxx:380
Definition: MetaTag.h:28
void warning(const char *msg)
Definition: Importer.cxx:128
static HelixNrInfo * start
size_t eatSpaces()
Definition: tools.h:58
void GB_warningf(const char *templat,...)
Definition: arb_msg.cxx:536
FeatureLinePtr getFeatureTableLine()
Definition: Importer.cxx:132
const std::string & getAccessionNumber() const
Definition: MetaInfo.cxx:186
virtual void import_section()=0
void backLine(const string &line)
Generic smart pointer.
Definition: smartptr.h:149
LineReader & flatfile
Definition: Importer.h:61
string lineError(const string &msg) const
const BaseCounter & getBaseCounter() const
stringVector warnings
Definition: Importer.h:64
EmblImporter(LineReader &Flatfile, DBwriter &DB_writer)
Definition: Importer.cxx:505
void createOrganism(const string &flatfile, const char *importerTag)
Definition: DBwriter.cxx:87
std::string rest() const
Definition: tools.h:48
void advance(size_t offset)
Definition: tools.h:46
long extractNumber()
Definition: tools.h:123
static long scanSeqlenFromLOCUS(const string &locusContent)
Definition: Importer.cxx:401
FeatureLines pushedFeatureLines
Definition: Importer.h:63
std::string extractWord(const char *delimiter=" ")
Definition: tools.h:99
static MetaTag genebank_meta_description[]
Definition: Importer.cxx:281
static long scanSeqlenFromID(const string &idContent)
Definition: Importer.cxx:538
FeatureLinePtr getUnwrappedFeatureTableLine()
Definition: Importer.cxx:146
const MetaTag * findTag(const string &tag)
Definition: Importer.h:68
const T * content() const
convert SmartPtr to plain old pointer (also works if isNull())
Definition: smartptr.h:263
std::string field
Definition: MetaTag.h:34
long expectedSeqLength
Definition: Importer.h:65
bool beginsWith(const std::string &str, const std::string &start)
Definition: arb_stdstr.h:22
void add(const std::string &field, const std::string &content)
Definition: MetaInfo.h:43
virtual const string & getFilename() const =0
static bool splitEmblTag(const string &line, string &tag, string &content)
Definition: Importer.cxx:509
FeaturePtr parseFeature()
Definition: Importer.cxx:191
void parseFeatureTable()
Definition: Importer.cxx:211
void setPosition(const stringCIter &position)
Definition: tools.h:45
size_t expectSpaces(size_t count=1, bool allowMore=true)
Definition: tools.h:67
static int line
Definition: arb_a2ps.c:296
#define NULp
Definition: cxxforward.h:116
const string & getMessage() const
Definition: DBwriter.h:47
DBwriter & db_writer
Definition: Importer.h:60
bool reinterpret_as_continued_line()
Definition: Importer.cxx:105
void addCount(Base base, size_t amount)
long eatNumber(bool &eaten)
Definition: tools.h:110
Base
stringCIter getPosition() const
Definition: tools.h:44
void start()
Definition: MetaInfo.cxx:46
void import()
Definition: Importer.cxx:234
Definition: MetaTag.h:22
static bool is_escaped(const string &str, size_t pos)
Definition: Importer.cxx:21
MetaTagType type
Definition: MetaTag.h:35