d6/d83/Importer_8cxx_source.html

 // ================================================================ //

 //                                                                  //

 //   File      : Importer.cxx                                       //

 //   Purpose   : Genome importer core                               //

 //                                                                  //

 //   Coded by Ralf Westram (coder@reallysoft.de) in November 2006   //

 //   Institute of Microbiology (Technical University Munich)        //

 //   http://www.arb-home.de/                                        //

 //                                                                  //

 // ================================================================ //


 #include "tools.h"

 #include "DBwriter.h"

 #include <arbdb.h>

 #include <arb_stdstr.h>


 using namespace std;


 // --------------------------------------------------------------------------------


 static bool is_escaped(const string& str, size_t pos) {
     // Tells whether the character at position 'pos' of 'str' is escaped by '\\'.
     //
     // The character is escaped if it is directly preceded by an odd number of
     // consecutive backslashes (an even number means the backslashes escape each other).
     // Position 0 can never be escaped.

     size_t backslashes = 0;
     while (backslashes < pos && str[pos-1-backslashes] == '\\') {
         ++backslashes;
     }
     return (backslashes % 2) == 1;
 }


 FeatureLine::FeatureLine(const string& line) {
     // Classifies one line of a feature table and splits it into 'name' and 'rest'.
     //
     // The first 5 columns are skipped; the column of the first non-space character
     // behind them decides how the line is interpreted:
     // - column 5                  -> feature start (FL_START)
     // - column 21, starting with '/' -> qualifier start (one of the FL_QUALIFIER... types)
     // - column 21 or higher       -> continuation of the previous line
     //
     // Throws a 'const char *' message if the line matches none of these.

     string::size_type first_char = line.find_first_not_of(' ', 5);

     orgLine = line;

     // string::npos compares '>= 21'. Lines w/o any content therefore have to be
     // rejected before the column checks, otherwise they would be handled as continued line.
     if (first_char == string::npos) {
         throw "Expected feature line, found empty line";
     }

     if (first_char == 5) { // feature start
         string::size_type behind_name = line.find_first_of(' ', first_char);
         string::size_type rest_start  = line.find_first_not_of(' ', behind_name);

         if (rest_start == string::npos) {
             if (behind_name == string::npos) throw "Expected space behind feature name";
             throw "Expected some content behind feature name";
         }

         name = line.substr(first_char, behind_name-first_char);
         rest = line.substr(rest_start);
         type = FL_START;
     }
     else if (first_char >= 21) { // not feature start
         if (first_char == 21 && line[first_char] == '/') { // qualifier start
             string::size_type equal_pos = line.find_first_of('=', first_char);
             if (equal_pos == string::npos) {
                 // qualifier w/o data (i.e. "/pseudo")
                 name = line.substr(first_char+1);
                 rest = "true";
                 type = FL_QUALIFIER_NODATA;
             }
             else {
                 name = line.substr(first_char+1, equal_pos-first_char-1);
                 rest = line.substr(equal_pos+1);

                 if (!rest.empty() && rest[0] == '"') {
                     size_t rlen = rest.length();

                     if (rlen == 1) {                // special case: only one open quote behind qualifier
                         type = FL_QUALIFIER_QUOTE_OPENED;
                     }
                     else if (rest[rlen-1] == '"' && !is_escaped(rest, rlen-1)) { // closing non-escaped quote at eol
                         type = FL_QUALIFIER_QUOTED;
                     }
                     else {
                         type = FL_QUALIFIER_QUOTE_OPENED;
                     }
                 }
                 else {
                     type = FL_QUALIFIER;
                 }
             }
         }
         else {                             // continued line
             interpret_as_continued_line();
         }
     }
     else { // content starts somewhere between column 6 and 20
         throw GBS_global_string("Expected feature line (first char at pos=%zu unexpected)", first_char);
     }
 }


 void FeatureLine::interpret_as_continued_line() {
     // Interprets the original line as continuation of the previous line:
     // everything from column 21 on is stored in 'rest'.
     // A '"' at the end of the line marks the line as closing a quoted qualifier.
     //
     // Note: string::substr throws std::out_of_range if the line has less than 21 characters.

     rest = orgLine.substr(21);

     // An empty 'rest' has no last character; w/o this guard the index below
     // would wrap around and read out of bounds.
     const bool closes_quote = !rest.empty() && rest[rest.length()-1] == '"';

     if (closes_quote) {
         type = FL_CONTINUED_QUOTE_CLOSED;
     }
     else {
         type = FL_CONTINUED;
     }
 }


 bool FeatureLine::reinterpret_as_continued_line() {
     // Re-classifies a line, that has been parsed as qualifier (with unquoted data or w/o data),
     // as continued line.
     //
     // Returns true if the line has been re-interpreted.
     // Returns false (and leaves the line untouched) for all other line types
     // or if the content of the line starts before column 21.

     if (type != FL_QUALIFIER && type != FL_QUALIFIER_NODATA) return false;

     const string::size_type content_start = orgLine.find_first_not_of(' ', 5);
     if (content_start < 21) return false;

     interpret_as_continued_line();
     return true;
 }


 // --------------------------------------------------------------------------------


 // Sets up an importer which reads lines from 'Flatfile' and hands results to 'DB_writer'.
 // 'meta_description' is passed on to 'tagTranslator' (the derived importers in this file
 // pass one of the static MetaTag tables).
 // 'expectedSeqLength' starts as -1; import() resets it to 0 before each section.
 Importer::Importer(LineReader& Flatfile, DBwriter& DB_writer, const MetaTag *meta_description)

     : db_writer(DB_writer),

       flatfile(Flatfile),

       tagTranslator(meta_description),

       expectedSeqLength(-1)

 {}


 void Importer::warning(const char *msg) {
     // Remembers 'msg'; all remembered messages are reported (and dropped) by show_warnings().
     warnings.push_back(string(msg));
 }


 FeatureLinePtr Importer::getFeatureTableLine() {
     // Delivers the next line of the feature table.
     //
     // Lines stored in 'pushedFeatureLines' are delivered first (taken from the back);
     // only if there are none, a new line is read via readFeatureTableLine().
     // The result is a NULL pointer if no further feature table line is available.

     if (!pushedFeatureLines.empty()) {
         FeatureLinePtr pushed = pushedFeatureLines.back();
         pushedFeatureLines.pop_back();
         return pushed;
     }

     FeatureLinePtr fresh;
     string         text;
     if (readFeatureTableLine(text)) {
         fresh = new FeatureLine(text);
     }
     return fresh;
 }


 FeatureLinePtr Importer::getUnwrappedFeatureTableLine() {
     // Delivers the next feature start or qualifier with line wrapping undone,
     // i.e. the content of all following continuation lines is appended to 'rest'.
     //
     // Returns a NULL pointer if there is no further feature table line.
     // The first line that does not belong to the delivered entry is pushed back.
     //
     // Throws a 'const char *' message
     // - if the next line is a continuation line,
     // - if a continuation line closes a quote that has not been opened or
     // - if an unclosed quoted qualifier is followed by a line that cannot be its continuation.

     FeatureLinePtr fline = getFeatureTableLine();
     if (!fline.isNull()) {
         if (fline->type & FL_META_CONTINUED) throw "Expected start of feature or qualifier";

         // qualifiers w/o data and completely quoted qualifiers cannot be wrapped
         if (0 == (fline->type & (FL_QUALIFIER_NODATA|FL_QUALIFIER_QUOTED))) {
             // qualifier/featurestart may be wrapped
             FeatureLinePtr next_fline = getFeatureTableLine();

             while (!next_fline.isNull() &&
                    fline->type != FL_QUALIFIER_QUOTED) // already seen closing quote
             {
                 if ((next_fline->type&FL_META_CONTINUED) == 0) {
                     // special case: a wrapped line of a quoted qualifier may start with /xxx
                     // (in that case it is misinterpreted as qualifier start)
                     if (fline->type == FL_QUALIFIER_QUOTE_OPENED) {
                         if (!next_fline->reinterpret_as_continued_line()) {
                             throw "did not see end of quoted qualifier (instead found next qualifier)";
                         }
                         gi_assert(next_fline->type & FL_META_CONTINUED);
                     }
                     else {
                         break; // next_fline starts a new entry
                     }
                 }

                 if (next_fline->type == FL_CONTINUED_QUOTE_CLOSED) {
                     if (fline->type != FL_QUALIFIER_QUOTE_OPENED) throw "Unexpected closing quote";
                     fline->type = FL_QUALIFIER_QUOTED; // ends the loop after appending this line
                 }
                 else {
                     gi_assert(next_fline->type == FL_CONTINUED);
                     gi_assert(fline->type == FL_START || fline->type == FL_QUALIFIER || fline->type == FL_QUALIFIER_QUOTE_OPENED);
                 }

                 fline->rest.append(next_fline->rest);
                 next_fline = getFeatureTableLine();
             }

             if (!next_fline.isNull()) backFeatureTableLine(next_fline);
         }
     }
     return fline;
 }


 FeaturePtr Importer::parseFeature() {
     // Reads one feature (feature start plus all of its qualifiers) from the feature table.
     //
     // Returns a NULL pointer if there is no further feature table line.
     // Throws "Expected feature start" if the next line does not start a feature.
     // The first line behind the feature's qualifiers is pushed back.

     FeaturePtr     feature;
     FeatureLinePtr fline = getUnwrappedFeatureTableLine();

     if (fline.isNull()) return feature; // no feature table line left

     if (fline->type != FL_START) throw "Expected feature start";
     feature = new Feature(fline->name, fline->rest);

     for (fline = getUnwrappedFeatureTableLine();
          !fline.isNull() && (fline->type & FL_META_QUALIFIER);
          fline = getUnwrappedFeatureTableLine())
     {
         feature->addQualifiedEntry(fline->name, fline->rest);
     }

     if (!fline.isNull()) backFeatureTableLine(fline);
     return feature;
 }


 void Importer::parseFeatureTable() {
     // Parses features until parseFeature() delivers no more.
     // Each feature is checked against 'expectedSeqLength', has its empty qualifiers fixed
     // and is then handed to the DB writer.

     for (FeaturePtr feature = parseFeature(); !feature.isNull(); feature = parseFeature()) {
         feature->expectLocationInSequence(expectedSeqLength);
         feature->fixEmptyQualifiers();
         db_writer.writeFeature(*feature, expectedSeqLength);
     }
 }


 void Importer::show_warnings(const string& import_of_what) {

     if (!warnings.empty()) {

         const char         *what = import_of_what.c_str();

         stringVectorCRIter  e    = warnings.rend();

         for (stringVectorCRIter i = warnings.rbegin(); i != e; ++i) {

             GB_warningf("Warning: %s: %s", what, i->c_str());

         }

         warnings.clear();

     }

 }


 void Importer::import() {
     // Imports all sections found in the flatfile by calling import_section() once per section.
     //
     // Errors are re-thrown in converted form:
     // - DBerror                  -> its message
     // - string or 'const char *' -> flatfile.lineError(message)

     try {
         string line;
         while (flatfile.getLine(line)) {
             if (line.empty()) continue; // silently skip empty lines before or after section

             flatfile.backLine(line); // line belongs to the section -> let import_section() read it

             // cleanup from import of previous section
             gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
             pushedFeatureLines.clear();
             warnings.clear();

             expectedSeqLength = 0; // reset expected seq. length
             import_section();

             gi_assert(warnings.empty());
             gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
         }
     }
     catch (const DBerror& err) { throw err.getMessage(); }
     catch (const string& err) { throw flatfile.lineError(err); }
     catch (const char *err) { throw flatfile.lineError(err); }
 }


 void Importer::check_base_counters(const SequenceBuffer& seqData, const BaseCounter *headerCount) {

     const BaseCounter& baseCounter = seqData.getBaseCounter();

     if (baseCounter.getCount(BC_ALL)<1) {

         warning("Sequence data is empty (only metadata found).");

     }

     if (!headerCount) {

         gi_assert(dynamic_cast<GenebankImporter*>(this) != NULp); // this case shall only happen with genebank files.

         warning("No 'BASE COUNT' found. Base counts have not been validated.");

     }

     else {

         headerCount->expectEqual(baseCounter);

     }

 }


 // --------------------------------------------------------------------------------

 // Meta information definitions

 //

 //

 // [ please keep the list of common entries in

 //      ../HELP_SOURCE/source/sp_info.hlp@"Commonly used database entries"

 //   up to date! ]


 // Tag table used by GenebankImporter.
 //
 // Each entry consists of
 // - the tag as found at the start of a flatfile line (sub-tags include their preceding spaces),
 // - a name which import_section() passes on together with the content
 //   (presumably the name of the database field; empty for tags that only control parsing) and
 // - the tag type, which decides how import_section() handles the tag.
 static MetaTag genebank_meta_description[] = {

     { "LOCUS",     "org_locus",   MT_HEADER },


     { "REFERENCE", "",            MT_REF_START },

     { "  AUTHORS", "author",      MT_REF },

     { "  TITLE",   "title",       MT_REF },

     { "  CONSRTM", "refgrp",      MT_REF },

     { "  JOURNAL", "journal",     MT_REF },

     { "   PUBMED", "pubmed_id",   MT_REF },

     { "  MEDLINE", "medline_id",  MT_REF },

     { "  REMARK",  "refremark",   MT_REF },


     { "DEFINITION", "definition", MT_BASIC },

     { "ACCESSION",  "acc",        MT_BASIC },

     { "VERSION",    "version",    MT_BASIC },

     { "DBLINK",     "db_xref",    MT_BASIC },

     { "KEYWORDS",   "keywd",      MT_BASIC },

     { "SOURCE",     "full_name",  MT_BASIC },

     { "  ORGANISM", "tax",        MT_BASIC },

     { "COMMENT",    "comment",    MT_BASIC },

     { "PROJECT",    "projref",    MT_BASIC },


     { "FEATURES", "", MT_FEATURE_START },

     { "CONTIG",   "", MT_CONTIG },

     { "BASE",     "", MT_SEQUENCE_START }, // BASE COUNT (sometimes missing)

     { "ORIGIN",   "", MT_SEQUENCE_START }, // only used if BASE COUNT is missing

     { "//",       "", MT_END },


     { "", "", MT_IGNORE },      // End of array

 };


 // Tag table used by EmblImporter.
 //
 // Each entry consists of
 // - the two-character tag found at the start of a flatfile line,
 // - a name which import_section() passes on together with the content
 //   (presumably the name of the database field; empty for tags that only control parsing) and
 // - the tag type, which decides how import_section() handles the tag.
 static MetaTag embl_meta_description[] = {

     { "ID", "org_id",          MT_HEADER },


     { "RN", "",                MT_REF_START },

     { "RA", "author",          MT_REF },

     { "RC", "auth_comm",       MT_REF },

     { "RG", "refgrp",          MT_REF },

     { "RL", "journal",         MT_REF },

     { "RP", "nuc_rp",          MT_REF },

     { "RT", "title",           MT_REF },

     { "RX", "",                MT_REF_DBID }, // @@@ extract field 'pubmed_id' ?


     { "AC", "acc",             MT_BASIC },

     { "AH", "assembly_header", MT_BASIC },

     { "AS", "assembly_info",   MT_BASIC },

     { "CC", "comment",         MT_BASIC },

     { "CO", "contig",          MT_BASIC },

     { "DE", "description",     MT_BASIC },

     { "DR", "db_xref",         MT_BASIC },

     { "DT", "date",            MT_BASIC },

     { "SV", "version",         MT_BASIC },

     { "KW", "keywd",           MT_BASIC },

     { "OS", "full_name",       MT_BASIC },

     { "OC", "tax",             MT_BASIC },

     { "OG", "organelle",       MT_BASIC },

     { "PR", "projref",         MT_BASIC },


     { "FH", "", MT_FEATURE_START },

     { "FT", "", MT_FEATURE },

     { "SQ", "", MT_SEQUENCE_START },

     { "//", "", MT_END },


     { "XX", "", MT_IGNORE }, // spacer


     { "", "", MT_IGNORE }, // End of array

 };


 // --------------------------------------------------------------------------------


 // Creates an importer for GenBank flatfiles: an Importer using 'genebank_meta_description' as tag table.
 GenebankImporter::GenebankImporter(LineReader& Flatfile, DBwriter& DB_writer)

     : Importer(Flatfile, DB_writer, genebank_meta_description)

 {}


 bool GenebankImporter::readFeatureTableLine(string& line) {
     // Reads the next line into 'line' and returns true if it starts with 5 spaces
     // (i.e. if it is accepted as feature table line).
     // Any other line is given back to the flatfile and false is returned;
     // false is returned as well if no line could be read.

     if (!flatfile.getLine(line)) return false;
     if (beginsWith(line, "     ")) return true;

     flatfile.backLine(line);
     return false;
 }


 static bool splitGenebankTag(const string& line, string& tag, string& content) {
     // Splits 'line' into 'tag' (incl. preceding spaces) and 'content'.
     //
     // - lines whose first non-space character is at column 12 or behind are wrapped lines:
     //   'tag' is set to "" and 'content' to everything from column 12 on
     // - lines w/o any non-space character are handled like wrapped lines with empty content
     //   (also if they are shorter than 12 characters)
     // - otherwise 'tag' is everything in front of the first space behind the tag and
     //   'content' starts at the first non-space character behind that space (or is "")
     //
     // returns true, if line suffices the format requirements (currently never returns false!)

     const string::size_type content_column = 12;

     string::size_type first_non_space = line.find_first_not_of(' ');

     if (first_non_space >= content_column) { // no tag, only content (string::npos ends up here as well)
         tag = "";
         if (line.length() > content_column) {
             content = line.substr(content_column);
         }
         else {
             // substr() must not be called with a start position behind the end of the line
             // (it would throw std::out_of_range)
             content = "";
         }
         return true;
     }

     // from here on: first_non_space < content_column

     string::size_type behind_tag = line.find_first_of(' ', first_non_space);
     if (behind_tag == string::npos) { // only tag w/o spaces behind
         tag     = line;
         content = "";
         return true;
     }

     string::size_type content_start = line.find_first_not_of(' ', behind_tag);
     if (content_start == string::npos) { // line w/o content
         content = "";
     }
     else {
         content = line.substr(content_start);
     }

     tag = line.substr(0, behind_tag);
     return true;
 }


 static long scanSeqlenFromLOCUS(const string& locusContent) {
     // Extracts the sequence length from the content of a LOCUS line.
     // Expected layout: a word (the id), then a number, then "bp".
     // Returns the number.

     StringParser parser(locusContent);

     parser.extractWord(); // skip the id
     parser.eatSpaces();

     const long seqlen = parser.extractNumber();

     parser.eatSpaces();
     parser.expectContent("bp");

     return seqlen;
 }


 void GenebankImporter::import_section() {

     MetaInfo   meta;

     References refs;


     const MetaTag *prevTag = NULp; // previously handled tag

     string         prevContent;    // previously found content


     bool seenHeaderLine = false;

     bool EOS            = false; // end of section ?


     // read header of file

     while (!EOS) {

         string line, tag, content;

         expectLine(line);

         if (!splitGenebankTag(line, tag, content)) {

             gi_assert(0);

         }


         if (tag.empty()) {      // no tag - happens at wrapped lines

             if (!content.empty()) { // do not append empty 'lines' from wrapped tag-entries.

                 prevContent.append(1, ' ');

                 prevContent.append(content);

             }

         }

         else { // start of new tag

             const MetaTag *knownTag = findTag(tag);

             if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());


             if (prevTag) { // save previous tag

                 switch (prevTag->type) {

                     case MT_REF:        refs.add(prevTag->field, prevContent); break;

                     case MT_BASIC:      meta.add(prevTag, prevContent, true); break;

                     case MT_HEADER:

                         meta.add(prevTag, prevContent, true); // save header line

                         expectedSeqLength = scanSeqlenFromLOCUS(prevContent);

                         break;

                     case MT_REF_DBID: // embl only

                     default: gi_assert(0); break;

                 }

                 prevTag = NULp;

             }


             switch (knownTag->type) {

                 case MT_HEADER:

                     if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());

                     seenHeaderLine = true;

                     // fall-through

                 case MT_BASIC:

                 case MT_REF:

                     prevTag     = knownTag;

                     prevContent = content;

                     break;


                 case MT_REF_START:

                     refs.start(); // start a new reference

                     break;


                 case MT_FEATURE_START:

                     db_writer.createOrganism(flatfile.getFilename(), "NCBI");

                     parseFeatureTable();

                     break;


                 case MT_SEQUENCE_START:

                     parseSequence(knownTag->tag, content);

                     EOS = true; // end of section

                     break;


                 case MT_IGNORE:

                     break;


                 case MT_END:

                     EOS = true;

                     break;


                 case MT_CONTIG:

                     throw GBS_global_string("Cannot import files containing CONTIG");


                 case MT_REF_DBID: // embl only

                 default:

                     gi_assert(0);

                     throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());

             }

         }

     }


     db_writer.finalizeOrganism(meta, refs, *this);

     show_warnings(meta.getAccessionNumber());

 }


 // --------------------------------------------------------------------------------


 // Creates an importer for EMBL flatfiles: an Importer using 'embl_meta_description' as tag table.
 EmblImporter::EmblImporter(LineReader& Flatfile, DBwriter& DB_writer)

     : Importer(Flatfile, DB_writer, embl_meta_description)

 {}


 static bool splitEmblTag(const string& line, string& tag, string& content) {
     // Splits 'line' into a 2-character 'tag' and its 'content'.
     //
     // Accepted are
     // - lines consisting of exactly 2 characters (tag w/o content) and
     // - lines where the first occurrence of 3 consecutive spaces is at pos 2-4
     //   (content starts at pos 5)
     //
     // Returns false for all other lines ('tag' and 'content' are not modified then).

     if (line.length() == 2) { // tag only
         tag = line;
         content.clear();
         return true;
     }

     const string::size_type separator = line.find("   "); // separator between tag and content
     if (separator != 2) return false;                     // expect separator at pos 2-4

     tag.assign(line, 0, 2);
     content.assign(line, 5, string::npos);
     return true;
 }


 bool EmblImporter::readFeatureTableLine(string& line) {
     // Reads the next line into 'line' and returns true if it starts with "FT   "
     // (i.e. if it is accepted as feature table line).
     // Any other line is given back to the flatfile and false is returned;
     // false is returned as well if no line could be read.

     if (!flatfile.getLine(line)) return false;
     if (beginsWith(line, "FT   ")) return true;

     flatfile.backLine(line);
     return false;
 }


 static long scanSeqlenFromID(const string& idContent) {
     // Extracts the sequence length from the content of an ID line:
     // the length is the word directly in front of the word "BP.".
     //
     // Throws "Could not parse bp from header" if the length evaluates to -1.
     //
     // NOTE(review): the loop only ends when "BP." is found; presumably
     // StringParser::extractWord throws when the content is exhausted - confirm.

     StringParser parser(idContent);
     string       previous = parser.extractWord(); // eat id
     long         bp       = -1;

     for (;;) {
         parser.eatSpaces();
         string word = parser.extractWord();
         if (word == "BP.") {
             // basecount is in word before "BP."
             bp = atol(previous.c_str());
             break;
         }
         previous = word;
     }

     if (bp == -1) throw "Could not parse bp from header";
     return bp;
 }


 void EmblImporter::import_section() {

     MetaInfo   meta;

     References refs;


     const MetaTag *prevTag      = NULp;             // previously handled tag

     string         prevContent;                     // previously found content

     bool           prevAppendNL = false;            // append '\n' into  multiline tags


     bool seenHeaderLine = false;

     bool EOS            = false; // end of section ?


     // read header of file

     while (!EOS) {

         string line, tag, content;

         expectLine(line);

         if (!splitEmblTag(line, tag, content)) {

             throw "Expected two-character tag at start of line";

         }


         const MetaTag *knownTag = findTag(tag);

         if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());


         if (knownTag == prevTag) {                  // multiline tag

             if (prevAppendNL) prevContent.append("\n"); // append a newline to make parsing in add_dbid() more easy

             prevContent.append(content);            // append w/o space - EMBL flatfiles have spaces at EOL when needed

         }

         else {                                      // start of new tag

             if (prevTag) {                          // save previous tag

                 switch (prevTag->type) {

                     case MT_REF:        refs.add(prevTag->field, prevContent); break;

                     case MT_REF_DBID:   refs.add_dbid(prevContent); prevAppendNL = false; break;

                     case MT_BASIC:      meta.add(prevTag, prevContent, true); break;

                     case MT_HEADER:

                         meta.add(prevTag, prevContent, true);

                         expectedSeqLength = scanSeqlenFromID(prevContent);

                         break;

                     default: gi_assert(0); break;

                 }

                 prevTag = NULp;

             }


             switch (knownTag->type) {

                 case MT_HEADER:

                     if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());

                     seenHeaderLine = true;

                     // fall-through

                 case MT_BASIC:

                 case MT_REF:

                     prevTag        = knownTag;

                     prevContent    = content;

                     break;


                 case MT_REF_DBID:

                     prevTag      = knownTag;

                     prevContent  = content;

                     prevAppendNL = true;

                     break;


                 case MT_REF_START:

                     refs.start(); // start a new reference

                     break;


                 case MT_FEATURE:

                     flatfile.backLine(line);

                     db_writer.createOrganism(flatfile.getFilename(), "EMBL");

                     parseFeatureTable();

                     break;


                 case MT_SEQUENCE_START:

                     parseSequence(content);

                     EOS = true; // end of section

                     break;


                 case MT_FEATURE_START:

                 case MT_IGNORE:

                     break;


                 default:

                     gi_assert(0);

                     throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());

             }

         }

     }

     db_writer.finalizeOrganism(meta, refs, *this);

     show_warnings(meta.getAccessionNumber());

 }


 // --------------------------------------------------------------------------------

 // sequence readers:


 inline bool parseCounter(bool expect, BaseCounter& headerCount, StringParser& parser, Base base, const char *word) {

     // parses part of string (e.g. " 6021225 BP;" or " 878196 A;")

     // if 'expect' == true -> throw exception if missing

     // if 'expect' == false -> return false if missing


     bool        found = false;

     stringCIter start = parser.getPosition();


     parser.expectSpaces(0);


     bool seen_number;

     long count = parser.eatNumber(seen_number);


     if (seen_number) {

         headerCount.addCount(base, count);

         size_t spaces = parser.eatSpaces();

         if (spaces>0) {

             size_t len = parser.lookingAt(word);

             if (len>0) {                        // seen

                 parser.advance(len);

                 found = true;

             }

         }

     }


     if (!found) {

         parser.setPosition(start); // reset position

         if (expect) throw GBS_global_string("Expected counter '### %s', found '%s'", word, parser.rest().c_str());

     }

     return found;

 }


 void GenebankImporter::parseSequence(const string& tag, const string& headerline) {

     SmartPtr<BaseCounter> headerCount;


     if (tag == "BASE") { // base count not always present

         // parse headerline :

         headerCount = new BaseCounter("sequence header");

         {

             StringParser parser(headerline);


             parser.expectContent("COUNT");


             parseCounter(true, *headerCount, parser, BC_A, "a");

             parseCounter(true, *headerCount, parser, BC_C, "c");

             parseCounter(true, *headerCount, parser, BC_G, "g");

             parseCounter(true, *headerCount, parser, BC_T, "t");

             parseCounter(false, *headerCount, parser, BC_OTHER, "others"); // not always present


             headerCount->calcOverallCounter();

         }

     }


     // parse sequence data

     size_t         est_seq_size = headerCount.isNull() ? 500000 : headerCount->getCount(BC_ALL);

     SequenceBuffer seqData(est_seq_size);

     {

         string line;


         if (!headerCount.isNull()) {

             // if BASE COUNT was present, check ORIGIN line

             // otherwise ORIGIN line has already been read

             expectLine(line);

             if (!beginsWith(line, "ORIGIN")) throw "Expected 'ORIGIN'";

         }


         bool eos_seen = false;

         while (!eos_seen) {

             expectLine(line);

             if (beginsWith(line, "//")) {

                 eos_seen = true;

             }

             else {

                 string       data;

                 data.reserve(60);

                 StringParser parser(line);


                 parser.eatSpaces(); // not sure whether there really have to be spaces if number has 9 digits or more

                 size_t cur_pos  = (size_t)parser.extractNumber();

                 size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);


                 if (cur_pos != (datasize+1)) {

                     throw GBS_global_string("Got wrong base position (found=%zu, expected=%zu)", cur_pos, size_t(datasize+1));

                 }


                 int blocks = 0;

                 while (!parser.atEnd() && parser.at() == ' ') {

                     parser.expectSpaces(1);


                     stringCIter start = parser.pos;

                     stringCIter end   = parser.find(' ');


                     data.append(start, end);

                     blocks++;

                 }


                 if (blocks>6) throw "Found more than 6 parts of sequence data";

                 seqData.addLine(data);

             }

         }

     }


     check_base_counters(seqData, headerCount.content());

     db_writer.writeSequence(seqData);

 }


 void EmblImporter::parseSequence(const string& headerline) {

     // parse headerline:

     BaseCounter  headerCount("sequence header");

     {

         StringParser parser(headerline);


         parser.expectContent("Sequence");


         parseCounter(true, headerCount, parser, BC_ALL,   "BP;");

         parseCounter(true, headerCount, parser, BC_A,     "A;");

         parseCounter(true, headerCount, parser, BC_C,     "C;");

         parseCounter(true, headerCount, parser, BC_G,     "G;");

         parseCounter(true, headerCount, parser, BC_T,     "T;");

         parseCounter(true, headerCount, parser, BC_OTHER, "other;");


         headerCount.checkOverallCounter();

     }


     // parse sequence data

     SequenceBuffer seqData(headerCount.getCount(BC_ALL));

     {

         bool   eos_seen = false;

         string line;


         while (!eos_seen) {

             expectLine(line);

             if (beginsWith(line, "//")) {

                 eos_seen = true;

             }

             else {

                 string data;

                 data.reserve(60);

                 StringParser parser(line);


                 parser.expectSpaces(5, false);

                 int blocks = 0;

                 while (!parser.atEnd() && isalpha(parser.at())) {

                     stringCIter start = parser.pos;

                     stringCIter end   = parser.find(' ');


                     data.append(start, end);

                     blocks++;

                     parser.expectSpaces(1);

                 }


                 if (blocks>6) throw "Found more than 6 parts of sequence data";


                 size_t basecount = (size_t)parser.extractNumber();


                 seqData.addLine(data);

                 size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);


                 if (basecount != datasize) {

                     throw GBS_global_string("Got wrong base counter(found=%zu, expected=%zu)", basecount, datasize);

                 }

             }

         }

     }


     check_base_counters(seqData, &headerCount);

     db_writer.writeSequence(seqData);

 }


Importer::backFeatureTableLine
void backFeatureTableLine(FeatureLinePtr &fline)
Definition: Importer.h:73

BaseCounter::calcOverallCounter
void calcOverallCounter()
Definition: SequenceBuffer.cxx:30

Importer::show_warnings
void show_warnings(const string &import_of_what)
Definition: Importer.cxx:222

BC_G
Definition: SequenceBuffer.h:36

type
GB_TYPES type
Definition: item_sel_list.cxx:117

FL_CONTINUED_QUOTE_CLOSED
Definition: Importer.h:34

Importer::check_base_counters
void check_base_counters(const SequenceBuffer &seqData, const BaseCounter *headerCount)
Definition: Importer.cxx:259

DBwriter::writeSequence
void writeSequence(const SequenceBuffer &seqData)
Definition: DBwriter.cxx:198

BC_T
Definition: SequenceBuffer.h:36

MT_REF_START
Definition: MetaTag.h:21

FeatureLine::FeatureLine
FeatureLine(const string &line)
Definition: Importer.cxx:33

LineReader::getLine
bool getLine(string &line)
Definition: BufferedFileReader.h:68

MetaInfo::add
void add(const MetaTag *meta, const std::string &content, bool allow_multiple_entries)
Definition: MetaInfo.cxx:173

DBwriter::writeFeature
void writeFeature(const Feature &feature, long seqLength)
Definition: DBwriter.cxx:149

gi_assert
#define gi_assert(cond)
Definition: defs.h:26

parseCounter
bool parseCounter(bool expect, BaseCounter &headerCount, StringParser &parser, Base base, const char *word)
Definition: Importer.cxx:652

embl_meta_description
static MetaTag embl_meta_description[]
Definition: Importer.cxx:312

MetaInfo
Definition: MetaInfo.h:58

Importer::expectLine
void expectLine(string &line)
Definition: Importer.h:67

Importer::readFeatureTableLine
virtual bool readFeatureTableLine(string &line)=0

References::add_dbid
void add_dbid(const std::string &content)
Definition: MetaInfo.cxx:122

FL_QUALIFIER
Definition: Importer.h:30

StringParser::lookingAt
size_t lookingAt(const char *content)
Definition: tools.h:79

LineReader
Definition: BufferedFileReader.h:34

splitGenebankTag
static bool splitGenebankTag(const string &line, string &tag, string &content)
Definition: Importer.cxx:366

GenebankImporter::GenebankImporter
GenebankImporter(LineReader &Flatfile, DBwriter &DB_writer)
Definition: Importer.cxx:352

GBS_global_string
const char * GBS_global_string(const char *templat,...)
Definition: arb_msg.cxx:203

std
STL namespace.

size_t

MetaTag::tag
std::string tag
Definition: MetaTag.h:33

BaseCounter::expectEqual
void expectEqual(const BaseCounter &other) const
Definition: SequenceBuffer.cxx:89

BaseCounter::getCount
size_t getCount(Base base) const
Definition: SequenceBuffer.h:64

StringParser::expectContent
void expectContent(const char *content)
Definition: tools.h:93

Importer::Importer
Importer(LineReader &Flatfile, DBwriter &DB_writer, const MetaTag *meta_description)
Definition: Importer.cxx:121

arbdb.h

SmartPtr::isNull
bool isNull() const
test if SmartPtr is NULp
Definition: smartptr.h:248

MT_FEATURE_START
Definition: MetaTag.h:24

FeatureLine
Definition: Importer.h:42

DBwriter::finalizeOrganism
void finalizeOrganism(const MetaInfo &meta, const References &refs, Importer &importer)
Definition: DBwriter.cxx:380

MT_END
Definition: MetaTag.h:28

BC_C
Definition: SequenceBuffer.h:36

Importer::warning
void warning(const char *msg)
Definition: Importer.cxx:128

FL_QUALIFIER_QUOTE_OPENED
Definition: Importer.h:33

start
static HelixNrInfo * start
Definition: RNA3D_StructureData.cxx:30

FL_META_CONTINUED
Definition: Importer.h:39

StringParser::eatSpaces
size_t eatSpaces()
Definition: tools.h:58

GB_warningf
void GB_warningf(const char *templat,...)
Definition: arb_msg.cxx:536

StringParser
Definition: tools.h:35

Importer::getFeatureTableLine
FeatureLinePtr getFeatureTableLine()
Definition: Importer.cxx:132

MetaInfo::getAccessionNumber
const std::string & getAccessionNumber() const
Definition: MetaInfo.cxx:186

Importer::import_section
virtual void import_section()=0

LineReader::backLine
void backLine(const string &line)
Definition: BufferedFileReader.h:79

SmartPtr
Generic smart pointer.
Definition: smartptr.h:149

Importer::flatfile
LineReader & flatfile
Definition: Importer.h:61

FL_QUALIFIER_NODATA
Definition: Importer.h:31

LineReader::lineError
string lineError(const string &msg) const
Definition: BufferedFileReader.cxx:89

SequenceBuffer::getBaseCounter
const BaseCounter & getBaseCounter() const
Definition: SequenceBuffer.h:90

Importer::warnings
stringVector warnings
Definition: Importer.h:64

MT_SEQUENCE_START
Definition: MetaTag.h:26

EmblImporter::EmblImporter
EmblImporter(LineReader &Flatfile, DBwriter &DB_writer)
Definition: Importer.cxx:505

DBwriter::createOrganism
void createOrganism(const string &flatfile, const char *importerTag)
Definition: DBwriter.cxx:87

BaseCounter
Definition: SequenceBuffer.h:38

Feature
Definition: Feature.h:18

StringParser::rest
std::string rest() const
Definition: tools.h:48

StringParser::advance
void advance(size_t offset)
Definition: tools.h:46

StringParser::extractNumber
long extractNumber()
Definition: tools.h:123

scanSeqlenFromLOCUS
static long scanSeqlenFromLOCUS(const string &locusContent)
Definition: Importer.cxx:401

MT_HEADER
Definition: MetaTag.h:19

Importer::pushedFeatureLines
FeatureLines pushedFeatureLines
Definition: Importer.h:63

DBwriter.h

StringParser::extractWord
std::string extractWord(const char *delimiter=" ")
Definition: tools.h:99

genebank_meta_description
static MetaTag genebank_meta_description[]
Definition: Importer.cxx:281

scanSeqlenFromID
static long scanSeqlenFromID(const string &idContent)
Definition: Importer.cxx:538

Importer::getUnwrappedFeatureTableLine
FeatureLinePtr getUnwrappedFeatureTableLine()
Definition: Importer.cxx:146

Importer::findTag
const MetaTag * findTag(const string &tag)
Definition: Importer.h:68

BC_OTHER
Definition: SequenceBuffer.h:36

SmartPtr::content
const T * content() const
convert SmartPtr to plain old pointer (also works if isNull())
Definition: smartptr.h:263

MetaTag::field
std::string field
Definition: MetaTag.h:34

MT_IGNORE
Definition: MetaTag.h:29

Importer::expectedSeqLength
long expectedSeqLength
Definition: Importer.h:65

FL_START
Definition: Importer.h:28

beginsWith
bool beginsWith(const std::string &str, const std::string &start)
Definition: arb_stdstr.h:22

MT_BASIC
Definition: MetaTag.h:20

References::add
void add(const std::string &field, const std::string &content)
Definition: MetaInfo.h:43

BC_A
Definition: SequenceBuffer.h:36

LineReader::getFilename
virtual const string & getFilename() const =0

References
Definition: MetaInfo.h:31

splitEmblTag
static bool splitEmblTag(const string &line, string &tag, string &content)
Definition: Importer.cxx:509

Importer::parseFeature
FeaturePtr parseFeature()
Definition: Importer.cxx:191

Importer::parseFeatureTable
void parseFeatureTable()
Definition: Importer.cxx:211

DBwriter
Definition: DBwriter.h:53

FL_META_QUALIFIER
Definition: Importer.h:38

StringParser::setPosition
void setPosition(const stringCIter &position)
Definition: tools.h:45

MetaTag
Definition: MetaTag.h:32

FL_QUALIFIER_QUOTED
Definition: Importer.h:32

StringParser::expectSpaces
size_t expectSpaces(size_t count=1, bool allowMore=true)
Definition: tools.h:67

Importer
Definition: Importer.h:58

tools.h

line
static int line
Definition: arb_a2ps.c:296

SequenceBuffer
Definition: SequenceBuffer.h:70

NULp
#define NULp
Definition: cxxforward.h:116

arb_stdstr.h

DBerror::getMessage
const string & getMessage() const
Definition: DBwriter.h:47

Importer::db_writer
DBwriter & db_writer
Definition: Importer.h:60

BC_ALL
Definition: SequenceBuffer.h:36

FeatureLine::reinterpret_as_continued_line
bool reinterpret_as_continued_line()
Definition: Importer.cxx:105

char

BaseCounter::addCount
void addCount(Base base, size_t amount)
Definition: SequenceBuffer.h:53

StringParser::eatNumber
long eatNumber(bool &eaten)
Definition: tools.h:110

Base
Base
Definition: SequenceBuffer.h:36

FL_CONTINUED
Definition: Importer.h:35

StringParser::getPosition
stringCIter getPosition() const
Definition: tools.h:44

References::start
void start()
Definition: MetaInfo.cxx:46

Importer::import
void import()
Definition: Importer.cxx:234

MT_REF
Definition: MetaTag.h:22

DBerror
Definition: DBwriter.h:33

is_escaped
static bool is_escaped(const string &str, size_t pos)
Definition: Importer.cxx:21

MetaTag::type
MetaTagType type
Definition: MetaTag.h:35

MT_REF_DBID
Definition: MetaTag.h:23

MT_FEATURE
Definition: MetaTag.h:25

MT_CONTIG
Definition: MetaTag.h:27