ARB
Importer.cxx
Go to the documentation of this file.
1 // ================================================================ //
2 // //
3 // File : Importer.cxx //
4 // Purpose : Genome importer core //
5 // //
6 // Coded by Ralf Westram (coder@reallysoft.de) in November 2006 //
7 // Institute of Microbiology (Technical University Munich) //
8 // http://www.arb-home.de/ //
9 // //
10 // ================================================================ //
11 
12 #include "tools.h"
13 #include "DBwriter.h"
14 #include <arbdb.h>
15 #include <arb_stdstr.h>
16 
17 using namespace std;
18 
19 // --------------------------------------------------------------------------------
20 
static bool is_escaped(const std::string& str, size_t pos) {
    // Returns true if the character at position 'pos' in 'str' is escaped by '\\'.
    //
    // A character is escaped if it is directly preceded by an ODD number of
    // consecutive backslashes (an even number means the backslashes escape
    // each other). Position 0 can never be escaped.
    //
    // The run of backslashes is counted iteratively, so arbitrarily long runs
    // cannot exhaust the call stack.

    size_t backslashes = 0;
    while (pos > 0 && str[pos-1] == '\\') {
        ++backslashes;
        --pos;
    }
    return (backslashes % 2) == 1;
}
32 
34  // start parsing at position 5
35  string::size_type first_char = line.find_first_not_of(' ', 5);
36 
37  orgLine = line;
38 
39  if (first_char == 5) { // feature start
40  string::size_type behind_name = line.find_first_of(' ', first_char);
41  string::size_type rest_start = line.find_first_not_of(' ', behind_name);
42 
43  if (rest_start == string::npos) {
44  if (behind_name == string::npos) throw "Expected space behind feature name";
45  throw "Expected some content behind feature name";
46  }
47 
48  name = line.substr(first_char, behind_name-first_char);
49  rest = line.substr(rest_start);
50  type = FL_START;
51  }
52  else if (first_char >= 21) { // not feature start
53  if (first_char == 21 && line[first_char] == '/') { // qualifier start
54  string::size_type equal_pos = line.find_first_of('=', first_char);
55  if (equal_pos == string::npos) {
56  // qualifier w/o data (i.e. "/pseudo")
57  name = line.substr(first_char+1);
58  rest = "true";
60  }
61  else {
62  name = line.substr(first_char+1, equal_pos-first_char-1);
63  rest = line.substr(equal_pos+1);
64 
65  if (rest[0] == '"') {
66  size_t rlen = rest.length();
67 
68  if (rlen == 1) { // special case: only one open quote behind qualifier
70  }
71  else if (rest[rlen-1] == '"' && !is_escaped(rest, rlen-1)) { // closing non-escaped quote at eol
73  }
74  else {
76  }
77  }
78  else {
80  }
81  }
82  }
83  else { // continued line
84  interpret_as_continued_line();
85  }
86  }
87  else {
88  if (first_char == string::npos) {
89  throw "Expected feature line, found empty line";
90  }
91  throw GBS_global_string("Expected feature line (first char at pos=%zu unexpected)", first_char);
92  }
93 }
94 
95 void FeatureLine::interpret_as_continued_line() {
96  rest = orgLine.substr(21);
97  if (rest[rest.length()-1] == '"') {
99  }
100  else {
101  type = FL_CONTINUED;
102  }
103 }
104 
106  bool ok = false;
107 
108  if (type == FL_QUALIFIER || type == FL_QUALIFIER_NODATA) {
109  string::size_type first_char = orgLine.find_first_not_of(' ', 5);
110  if (first_char >= 21) {
111  interpret_as_continued_line();
112  ok = true;
113  }
114  }
115 
116  return ok;
117 }
118 
119 // --------------------------------------------------------------------------------
120 
121 Importer::Importer(LineReader& Flatfile, DBwriter& DB_writer, const MetaTag *meta_description)
122  : db_writer(DB_writer),
123  flatfile(Flatfile),
124  tagTranslator(meta_description),
125  expectedSeqLength(-1)
126 {}
127 
128 void Importer::warning(const char *msg) {
129  warnings.push_back(msg);
130 }
131 
133  FeatureLinePtr fline;
134 
135  if (pushedFeatureLines.empty()) { // nothing on stack -> read new
136  string line;
137  if (readFeatureTableLine(line)) fline = new FeatureLine(line);
138  }
139  else {
140  fline = pushedFeatureLines.back();
141  pushedFeatureLines.pop_back();
142  }
143  return fline;
144 }
145 
148  if (!fline.isNull()) {
149  if (fline->type & FL_META_CONTINUED) throw "Expected start of feature or qualifier";
150 
151  if (0 == (fline->type & (FL_QUALIFIER_NODATA|FL_QUALIFIER_QUOTED))) {
152  // qualifier/featurestart may be wrapped
153  FeatureLinePtr next_fline = getFeatureTableLine();
154 
155  while (!next_fline.isNull() &&
156  fline->type != FL_QUALIFIER_QUOTED) // already seen closing quote
157  {
158  if ((next_fline->type&FL_META_CONTINUED) == 0) {
159  // special case: a wrapped line of a quoted qualifier may start with /xxx
160  // (in that case it is misinterpreted as qualifier start)
161  if (fline->type == FL_QUALIFIER_QUOTE_OPENED) {
162  if (!next_fline->reinterpret_as_continued_line()) {
163  throw "did not see end of quoted qualifier (instead found next qualifiert)";
164  }
165  gi_assert(next_fline->type & FL_META_CONTINUED);
166  }
167  else {
168  break;
169  }
170  }
171 
172  if (next_fline->type == FL_CONTINUED_QUOTE_CLOSED) {
173  if (fline->type != FL_QUALIFIER_QUOTE_OPENED) throw "Unexpected closing quote";
174  fline->type = FL_QUALIFIER_QUOTED;
175  }
176  else {
177  gi_assert(next_fline->type == FL_CONTINUED);
178  gi_assert(fline->type == FL_START || fline->type == FL_QUALIFIER || fline->type == FL_QUALIFIER_QUOTE_OPENED);
179  }
180 
181  fline->rest.append(next_fline->rest);
182  next_fline = getFeatureTableLine();
183  }
184 
185  if (!next_fline.isNull()) backFeatureTableLine(next_fline);
186  }
187  }
188  return fline;
189 }
190 
192  FeaturePtr feature;
194 
195  if (!fline.isNull()) { // found a feature table line
196  if (fline->type != FL_START) throw "Expected feature start";
197 
198  feature = new Feature(fline->name, fline->rest);
199 
201  while (!fline.isNull() && (fline->type & FL_META_QUALIFIER)) {
202  feature->addQualifiedEntry(fline->name, fline->rest);
204  }
205  if (!fline.isNull()) backFeatureTableLine(fline);
206  }
207 
208  return feature;
209 }
210 
212  FeaturePtr feature = parseFeature();
213 
214  while (!feature.isNull()) {
215  feature->expectLocationInSequence(expectedSeqLength);
216  feature->fixEmptyQualifiers();
218  feature = parseFeature();
219  }
220 }
221 
222 void Importer::show_warnings(const string& import_of_what) {
223  if (!warnings.empty()) {
224  const char *what = import_of_what.c_str();
225  stringVectorCRIter e = warnings.rend();
226  for (stringVectorCRIter i = warnings.rbegin(); i != e; ++i) {
227  GB_warningf("Warning: %s: %s", what, i->c_str());
228  }
229  warnings.clear();
230  }
231 }
232 
233 
235  try {
236  string line;
237  while (flatfile.getLine(line)) {
238  if (!line.empty()) { // silently skip empty lines before or after section
239  flatfile.backLine(line);
240 
241  // cleanup from import of previous section
242  gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
243  pushedFeatureLines.clear();
244  warnings.clear();
245 
246  expectedSeqLength = 0; // reset expected seq. length
247  import_section();
248 
249  gi_assert(warnings.empty());
250  gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
251  }
252  }
253  }
254  catch (const DBerror& err) { throw err.getMessage(); }
255  catch (const string& err) { throw flatfile.lineError(err); }
256  catch (const char *err) { throw flatfile.lineError(err); }
257 }
258 
259 // --------------------------------------------------------------------------------
260 // Meta information definitions
261 //
262 //
263 // [ please keep the list of common entries in
264 // ../HELP_SOURCE/oldhelp/sp_info.hlp
265 // up to date! ]
266 
268  { "LOCUS", "org_locus", MT_HEADER },
269 
270  { "REFERENCE", "", MT_REF_START },
271  { " AUTHORS", "author", MT_REF },
272  { " TITLE", "title", MT_REF },
273  { " CONSRTM", "refgrp", MT_REF },
274  { " JOURNAL", "journal", MT_REF },
275  { " PUBMED", "pubmed_id", MT_REF },
276  { " MEDLINE", "medline_id", MT_REF },
277  { " REMARK", "refremark", MT_REF },
278 
279  { "DEFINITION", "definition", MT_BASIC },
280  { "ACCESSION", "acc", MT_BASIC },
281  { "VERSION", "version", MT_BASIC },
282  { "KEYWORDS", "keywd", MT_BASIC },
283  { "SOURCE", "full_name", MT_BASIC },
284  { " ORGANISM", "tax", MT_BASIC },
285  { "COMMENT", "comment", MT_BASIC },
286  { "PROJECT", "projref", MT_BASIC },
287 
288  { "FEATURES", "", MT_FEATURE_START },
289  { "CONTIG", "", MT_CONTIG },
290  { "BASE", "", MT_SEQUENCE_START }, // BASE COUNT (sometimes missing)
291  { "ORIGIN", "", MT_SEQUENCE_START }, // only used if BASE COUNT is missing
292  { "//", "", MT_END },
293 
294  { "", "", MT_IGNORE }, // End of array
295 };
296 
298  { "ID", "org_id", MT_HEADER },
299 
300  { "RN", "", MT_REF_START },
301  { "RA", "author", MT_REF },
302  { "RC", "auth_comm", MT_REF },
303  { "RG", "refgrp", MT_REF },
304  { "RL", "journal", MT_REF },
305  { "RP", "nuc_rp", MT_REF },
306  { "RT", "title", MT_REF },
307  { "RX", "", MT_REF_DBID }, // @@@ extract field 'pubmed_id' ?
308 
309  { "AC", "acc", MT_BASIC },
310  { "AH", "assembly_header", MT_BASIC },
311  { "AS", "assembly_info", MT_BASIC },
312  { "CC", "comment", MT_BASIC },
313  { "CO", "contig", MT_BASIC },
314  { "DE", "description", MT_BASIC },
315  { "DR", "db_xref", MT_BASIC },
316  { "DT", "date", MT_BASIC },
317  { "SV", "version", MT_BASIC },
318  { "KW", "keywd", MT_BASIC },
319  { "OS", "full_name", MT_BASIC },
320  { "OC", "tax", MT_BASIC },
321  { "OG", "organelle", MT_BASIC },
322  { "PR", "projref", MT_BASIC },
323 
324  { "FH", "", MT_FEATURE_START },
325  { "FT", "", MT_FEATURE },
326  { "SQ", "", MT_SEQUENCE_START },
327  { "//", "", MT_END },
328 
329  { "XX", "", MT_IGNORE }, // spacer
330 
331  { "", "", MT_IGNORE }, // End of array
332 };
333 
334 // --------------------------------------------------------------------------------
335 
336 
338  : Importer(Flatfile, DB_writer, genebank_meta_description)
339 {}
340 
341 bool GenebankImporter::readFeatureTableLine(string& line) {
342  if (flatfile.getLine(line)) {
343  if (beginsWith(line, " ")) {
344  return true;
345  }
346  flatfile.backLine(line);
347  }
348  return false;
349 }
350 
static bool splitGenebankTag(const std::string& line, std::string& tag, std::string& content) {
    // Splits 'line' into 'tag' (including its preceding spaces) and 'content'.
    // Returns true if the line satisfies the format requirements; on false,
    // 'tag' and 'content' are left untouched.
    // Note: wrapped lines (content starting at column 12) yield tag == "".

    typedef std::string::size_type pos_t;

    const pos_t CONTENT_COLUMN = 12;
    const pos_t NOT_FOUND      = std::string::npos;

    const pos_t textStart = line.find_first_not_of(' ');

    // line consisting of exactly CONTENT_COLUMN spaces == wrapped line with empty content
    const bool blankWrappedLine = (textStart == NOT_FOUND) && (line.length() == CONTENT_COLUMN);

    if (textStart == CONTENT_COLUMN || blankWrappedLine) { // no tag, only content
        tag     = "";
        content = line.substr(CONTENT_COLUMN);
        return true;
    }

    if (textStart > CONTENT_COLUMN) return false; // text starts too far right (or line has no text at all)

    const pos_t tagEnd = line.find_first_of(' ', textStart);
    if (tagEnd == NOT_FOUND) { // tag reaches up to end of line
        tag     = line;
        content = "";
        return true;
    }

    const pos_t contentStart = line.find_first_not_of(' ', tagEnd);
    if (contentStart != NOT_FOUND) {
        content = line.substr(contentStart);
    }
    else { // only spaces behind tag
        content = "";
    }

    tag = line.substr(0, tagEnd);
    return true;
}
385 
386 static long scanSeqlenFromLOCUS(const string& locusContent) {
387  StringParser parser(locusContent);
388  parser.extractWord(); // id
389  parser.eatSpaces();
390 
391  long bp = parser.extractNumber();
392  parser.eatSpaces();
393  parser.expectContent("bp");
394 
395  return bp;
396 }
397 
398 void GenebankImporter::import_section() {
399  MetaInfo meta;
400  References refs;
401 
402  const MetaTag *prevTag = NULp; // previously handled tag
403  string prevContent; // previously found content
404 
405  bool seenHeaderLine = false;
406  bool EOS = false; // end of section ?
407 
408  // read header of file
409  while (!EOS) {
410  string line, tag, content;
411  expectLine(line);
412  if (!splitGenebankTag(line, tag, content)) {
413  gi_assert(0);
414  }
415 
416  if (tag.empty()) { // no tag - happens at wrapped lines
417  prevContent.append(1, ' ');
418  prevContent.append(content);
419  }
420  else { // start of new tag
421  const MetaTag *knownTag = findTag(tag);
422  if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
423 
424  if (prevTag) { // save previous tag
425  switch (prevTag->type) {
426  case MT_REF: refs.add(prevTag->field, prevContent); break;
427  case MT_BASIC: meta.add(prevTag, prevContent, true); break;
428  case MT_HEADER:
429  meta.add(prevTag, prevContent, true); // save header line
430  expectedSeqLength = scanSeqlenFromLOCUS(prevContent);
431  break;
432  case MT_REF_DBID: // embl only
433  default: gi_assert(0); break;
434  }
435  prevTag = NULp;
436  }
437 
438  switch (knownTag->type) {
439  case MT_HEADER:
440  if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
441  seenHeaderLine = true;
442  // fall-through
443  case MT_BASIC:
444  case MT_REF:
445  prevTag = knownTag;
446  prevContent = content;
447  break;
448 
449  case MT_REF_START:
450  refs.start(); // start a new reference
451  break;
452 
453  case MT_FEATURE_START:
456  break;
457 
458  case MT_SEQUENCE_START:
459  parseSequence(knownTag->tag, content);
460  EOS = true; // end of section
461  break;
462 
463  case MT_IGNORE:
464  break;
465 
466  case MT_END:
467  EOS = true;
468  break;
469 
470  case MT_CONTIG:
471  throw GBS_global_string("Cannot import files containing CONTIG");
472 
473  case MT_REF_DBID: // embl only
474  default:
475  gi_assert(0);
476  throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
477  }
478  }
479  }
480 
481  db_writer.finalizeOrganism(meta, refs, *this);
483 }
484 
485 // --------------------------------------------------------------------------------
486 
487 
489  : Importer(Flatfile, DB_writer, embl_meta_description)
490 {}
491 
static bool splitEmblTag(const std::string& line, std::string& tag, std::string& content) {
    // Splits 'line' into a 2-character tag and its content.
    // Returns true on success (i.e. if the line satisfies the required format):
    //   - either the line consists of the 2-character tag only (content is ""),
    //   - or the tag is followed by a 3-blank spacer (positions 2-4) and the
    //     content starts at position 5.
    // On failure 'tag' and 'content' are left untouched.

    if (line.length() == 2) {
        tag     = line;
        content = "";
    }
    else {
        // The complete spacer has to be searched for: testing for a single
        // blank only would accept lines shorter than 5 characters and make
        // substr(5) below throw std::out_of_range.
        std::string::size_type spacer = line.find("   "); // separator between tag and content
        if (spacer != 2) return false;                    // expect spacer at pos 2-4

        tag     = line.substr(0, 2);
        content = line.substr(5);
    }

    return true;
}
510 
511 bool EmblImporter::readFeatureTableLine(string& line) {
512  if (flatfile.getLine(line)) {
513  if (beginsWith(line, "FT ")) {
514  return true;
515  }
516  flatfile.backLine(line);
517  }
518  return false;
519 }
520 
521 static long scanSeqlenFromID(const string& idContent) {
522  StringParser parser(idContent);
523  string lastWord = parser.extractWord(); // eat id
524  bool bpseen = false;
525  long bp = -1;
526 
527  while (!bpseen) {
528  parser.eatSpaces();
529  string word = parser.extractWord();
530  if (word == "BP.") {
531  // basecount is in word before "BP."
532  bp = atol(lastWord.c_str());
533  bpseen = true;
534  }
535  else {
536  lastWord = word;
537  }
538  }
539 
540  if (bp == -1) throw "Could not parse bp from header";
541 
542  return bp;
543 }
544 
545 void EmblImporter::import_section() {
546  MetaInfo meta;
547  References refs;
548 
549  const MetaTag *prevTag = NULp; // previously handled tag
550  string prevContent; // previously found content
551  bool prevAppendNL = false; // append '\n' into multiline tags
552 
553  bool seenHeaderLine = false;
554  bool EOS = false; // end of section ?
555 
556  // read header of file
557  while (!EOS) {
558  string line, tag, content;
559  expectLine(line);
560  if (!splitEmblTag(line, tag, content)) {
561  throw "Expected two-character tag at start of line";
562  }
563 
564  const MetaTag *knownTag = findTag(tag);
565  if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
566 
567  if (knownTag == prevTag) { // multiline tag
568  if (prevAppendNL) prevContent.append("\n"); // append a newline to make parsing in add_dbid() more easy
569  prevContent.append(content); // append w/o space - EMBL flatfiles have spaces at EOL when needed
570  }
571  else { // start of new tag
572  if (prevTag) { // save previous tag
573  switch (prevTag->type) {
574  case MT_REF: refs.add(prevTag->field, prevContent); break;
575  case MT_REF_DBID: refs.add_dbid(prevContent); prevAppendNL = false; break;
576  case MT_BASIC: meta.add(prevTag, prevContent, true); break;
577  case MT_HEADER:
578  meta.add(prevTag, prevContent, true);
579  expectedSeqLength = scanSeqlenFromID(prevContent);
580  break;
581  default: gi_assert(0); break;
582  }
583  prevTag = NULp;
584  }
585 
586  switch (knownTag->type) {
587  case MT_HEADER:
588  if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
589  seenHeaderLine = true;
590  // fall-through
591  case MT_BASIC:
592  case MT_REF:
593  prevTag = knownTag;
594  prevContent = content;
595  break;
596 
597  case MT_REF_DBID:
598  prevTag = knownTag;
599  prevContent = content;
600  prevAppendNL = true;
601  break;
602 
603  case MT_REF_START:
604  refs.start(); // start a new reference
605  break;
606 
607  case MT_FEATURE:
608  flatfile.backLine(line);
611  break;
612 
613  case MT_SEQUENCE_START:
614  parseSequence(content);
615  EOS = true; // end of section
616  break;
617 
618  case MT_FEATURE_START:
619  case MT_IGNORE:
620  break;
621 
622  default:
623  gi_assert(0);
624  throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
625  }
626  }
627  }
628  db_writer.finalizeOrganism(meta, refs, *this);
630 }
631 
632 // --------------------------------------------------------------------------------
633 // sequence readers:
634 
635 inline bool parseCounter(bool expect, BaseCounter& headerCount, StringParser& parser, Base base, const char *word) {
636  // parses part of string (e.g. " 6021225 BP;" or " 878196 A;")
637  // if 'expect' == true -> throw exception if missing
638  // if 'expect' == false -> return false if missing
639 
640  bool found = false;
641  stringCIter start = parser.getPosition();
642 
643  parser.expectSpaces(0);
644 
645  bool seen_number;
646  long count = parser.eatNumber(seen_number);
647 
648  if (seen_number) {
649  headerCount.addCount(base, count);
650  size_t spaces = parser.eatSpaces();
651  if (spaces>0) {
652  size_t len = parser.lookingAt(word);
653  if (len>0) { // seen
654  parser.advance(len);
655  found = true;
656  }
657  }
658  }
659 
660  if (!found) {
661  parser.setPosition(start); // reset position
662  if (expect) throw GBS_global_string("Expected counter '### %s', found '%s'", word, parser.rest().c_str());
663  }
664  return found;
665 }
666 
667 void GenebankImporter::parseSequence(const string& tag, const string& headerline) {
668  SmartPtr<BaseCounter> headerCount;
669 
670  if (tag == "BASE") { // base count not always present
671  // parse headerline :
672  headerCount = new BaseCounter("sequence header");
673  {
674  StringParser parser(headerline);
675 
676  parser.expectContent("COUNT");
677 
678  parseCounter(true, *headerCount, parser, BC_A, "a");
679  parseCounter(true, *headerCount, parser, BC_C, "c");
680  parseCounter(true, *headerCount, parser, BC_G, "g");
681  parseCounter(true, *headerCount, parser, BC_T, "t");
682  parseCounter(false, *headerCount, parser, BC_OTHER, "others"); // not always present
683 
684  headerCount->calcOverallCounter();
685  }
686  }
687 
688  // parse sequence data
689  size_t est_seq_size = headerCount.isNull() ? 500000 : headerCount->getCount(BC_ALL);
690  SequenceBuffer seqData(est_seq_size);
691  {
692  string line;
693 
694  if (!headerCount.isNull()) {
695  // if BASE COUNT was present, check ORIGIN line
696  // otherwise ORIGIN line has already been read
697  expectLine(line);
698  if (!beginsWith(line, "ORIGIN")) throw "Expected 'ORIGIN'";
699  }
700 
701  bool eos_seen = false;
702  while (!eos_seen) {
703  expectLine(line);
704  if (beginsWith(line, "//")) {
705  eos_seen = true;
706  }
707  else {
708  string data;
709  data.reserve(60);
710  StringParser parser(line);
711 
712  parser.eatSpaces(); // not sure whether there really have to be spaces if number has 9 digits or more
713  size_t cur_pos = (size_t)parser.extractNumber();
714  size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
715 
716  if (cur_pos != (datasize+1)) {
717  throw GBS_global_string("Got wrong base position (found=%zu, expected=%zu)", cur_pos, size_t(datasize+1));
718  }
719 
720  int blocks = 0;
721  while (!parser.atEnd() && parser.at() == ' ') {
722  parser.expectSpaces(1);
723 
724  stringCIter start = parser.pos;
725  stringCIter end = parser.find(' ');
726 
727  data.append(start, end);
728  blocks++;
729  }
730 
731  if (blocks>6) throw "Found more than 6 parts of sequence data";
732  seqData.addLine(data);
733  }
734  }
735  }
736 
737  if (headerCount.isNull()) {
738  warning("No 'BASE COUNT' found. Base counts have not been validated.");
739  }
740  else {
741  headerCount->expectEqual(seqData.getBaseCounter());
742  }
743  db_writer.writeSequence(seqData);
744 }
745 
746 void EmblImporter::parseSequence(const string& headerline) {
747  // parse headerline:
748  BaseCounter headerCount("sequence header");
749  {
750  StringParser parser(headerline);
751 
752  parser.expectContent("Sequence");
753 
754  parseCounter(true, headerCount, parser, BC_ALL, "BP;");
755  parseCounter(true, headerCount, parser, BC_A, "A;");
756  parseCounter(true, headerCount, parser, BC_C, "C;");
757  parseCounter(true, headerCount, parser, BC_G, "G;");
758  parseCounter(true, headerCount, parser, BC_T, "T;");
759  parseCounter(true, headerCount, parser, BC_OTHER, "other;");
760 
761  headerCount.checkOverallCounter();
762  }
763 
764  // parse sequence data
765  SequenceBuffer seqData(headerCount.getCount(BC_ALL));
766  {
767  bool eos_seen = false;
768  string line;
769 
770  while (!eos_seen) {
771  expectLine(line);
772  if (beginsWith(line, "//")) {
773  eos_seen = true;
774  }
775  else {
776  string data;
777  data.reserve(60);
778  StringParser parser(line);
779 
780  parser.expectSpaces(5, false);
781  int blocks = 0;
782  while (!parser.atEnd() && isalpha(parser.at())) {
783  stringCIter start = parser.pos;
784  stringCIter end = parser.find(' ');
785 
786  data.append(start, end);
787  blocks++;
788  parser.expectSpaces(1);
789  }
790 
791  if (blocks>6) throw "Found more than 6 parts of sequence data";
792 
793  size_t basecount = (size_t)parser.extractNumber();
794 
795  seqData.addLine(data);
796  size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
797 
798  if (basecount != datasize) {
799  throw GBS_global_string("Got wrong base counter(found=%zu, expected=%zu)", basecount, datasize);
800  }
801  }
802  }
803  }
804 
805  headerCount.expectEqual(seqData.getBaseCounter());
806  db_writer.writeSequence(seqData);
807 }
808 
void backFeatureTableLine(FeatureLinePtr &fline)
Definition: Importer.h:73
void calcOverallCounter()
void show_warnings(const string &import_of_what)
Definition: Importer.cxx:222
GB_TYPES type
void writeSequence(const SequenceBuffer &seqData)
Definition: DBwriter.cxx:198
FeatureLine(const string &line)
Definition: Importer.cxx:33
bool getLine(string &line)
void add(const MetaTag *meta, const std::string &content, bool allow_multiple_entries)
Definition: MetaInfo.cxx:173
void writeFeature(const Feature &feature, long seqLength)
Definition: DBwriter.cxx:149
#define gi_assert(cond)
Definition: defs.h:26
bool parseCounter(bool expect, BaseCounter &headerCount, StringParser &parser, Base base, const char *word)
Definition: Importer.cxx:635
static MetaTag embl_meta_description[]
Definition: Importer.cxx:297
void expectLine(string &line)
Definition: Importer.h:67
virtual bool readFeatureTableLine(string &line)=0
void add_dbid(const std::string &content)
Definition: MetaInfo.cxx:122
size_t lookingAt(const char *content)
Definition: tools.h:79
static bool splitGenebankTag(const string &line, string &tag, string &content)
Definition: Importer.cxx:351
GenebankImporter(LineReader &Flatfile, DBwriter &DB_writer)
Definition: Importer.cxx:337
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:204
STL namespace.
std::string tag
Definition: MetaTag.h:33
void expectEqual(const BaseCounter &other) const
size_t getCount(Base base) const
void expectContent(const char *content)
Definition: tools.h:93
Importer(LineReader &Flatfile, DBwriter &DB_writer, const MetaTag *meta_description)
Definition: Importer.cxx:121
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248
void finalizeOrganism(const MetaInfo &meta, const References &refs, Importer &importer)
Definition: DBwriter.cxx:380
Definition: MetaTag.h:28
void warning(const char *msg)
Definition: Importer.cxx:128
static HelixNrInfo * start
size_t eatSpaces()
Definition: tools.h:58
void GB_warningf(const char *templat,...)
Definition: arb_msg.cxx:490
FeatureLinePtr getFeatureTableLine()
Definition: Importer.cxx:132
const std::string & getAccessionNumber() const
Definition: MetaInfo.cxx:186
virtual void import_section()=0
void backLine(const string &line)
Generic smart pointer.
Definition: smartptr.h:149
LineReader & flatfile
Definition: Importer.h:61
string lineError(const string &msg) const
stringVector warnings
Definition: Importer.h:64
EmblImporter(LineReader &Flatfile, DBwriter &DB_writer)
Definition: Importer.cxx:488
void createOrganism(const string &flatfile, const char *importerTag)
Definition: DBwriter.cxx:87
std::string rest() const
Definition: tools.h:48
void advance(size_t offset)
Definition: tools.h:46
long extractNumber()
Definition: tools.h:123
static long scanSeqlenFromLOCUS(const string &locusContent)
Definition: Importer.cxx:386
FeatureLines pushedFeatureLines
Definition: Importer.h:63
std::string extractWord(const char *delimiter=" ")
Definition: tools.h:99
static MetaTag genebank_meta_description[]
Definition: Importer.cxx:267
static long scanSeqlenFromID(const string &idContent)
Definition: Importer.cxx:521
FeatureLinePtr getUnwrappedFeatureTableLine()
Definition: Importer.cxx:146
const MetaTag * findTag(const string &tag)
Definition: Importer.h:68
std::string field
Definition: MetaTag.h:34
long expectedSeqLength
Definition: Importer.h:65
bool beginsWith(const std::string &str, const std::string &start)
Definition: arb_stdstr.h:22
void add(const std::string &field, const std::string &content)
Definition: MetaInfo.h:43
virtual const string & getFilename() const =0
static bool splitEmblTag(const string &line, string &tag, string &content)
Definition: Importer.cxx:492
FeaturePtr parseFeature()
Definition: Importer.cxx:191
void parseFeatureTable()
Definition: Importer.cxx:211
void setPosition(const stringCIter &position)
Definition: tools.h:45
size_t expectSpaces(size_t count=1, bool allowMore=true)
Definition: tools.h:67
static int line
Definition: arb_a2ps.c:296
#define NULp
Definition: cxxforward.h:97
const string & getMessage() const
Definition: DBwriter.h:47
DBwriter & db_writer
Definition: Importer.h:60
bool reinterpret_as_continued_line()
Definition: Importer.cxx:105
void addCount(Base base, size_t amount)
long eatNumber(bool &eaten)
Definition: tools.h:110
Base
stringCIter getPosition() const
Definition: tools.h:44
void start()
Definition: MetaInfo.cxx:46
void import()
Definition: Importer.cxx:234
Definition: MetaTag.h:22
static bool is_escaped(const string &str, size_t pos)
Definition: Importer.cxx:21
MetaTagType type
Definition: MetaTag.h:35