26 if (str[pos-1] ==
'\\') {
35 string::size_type first_char = line.find_first_not_of(
' ', 5);
39 if (first_char == 5) {
40 string::size_type behind_name = line.find_first_of(
' ', first_char);
41 string::size_type rest_start = line.find_first_not_of(
' ', behind_name);
43 if (rest_start == string::npos) {
44 if (behind_name == string::npos)
throw "Expected space behind feature name";
45 throw "Expected some content behind feature name";
48 name = line.substr(first_char, behind_name-first_char);
49 rest = line.substr(rest_start);
52 else if (first_char >= 21) {
53 if (first_char == 21 && line[first_char] ==
'/') {
54 string::size_type equal_pos = line.find_first_of(
'=', first_char);
55 if (equal_pos == string::npos) {
57 name = line.substr(first_char+1);
62 name = line.substr(first_char+1, equal_pos-first_char-1);
63 rest = line.substr(equal_pos+1);
66 size_t rlen = rest.length();
71 else if (rest[rlen-1] ==
'"' && !
is_escaped(rest, rlen-1)) {
84 interpret_as_continued_line();
88 if (first_char == string::npos) {
89 throw "Expected feature line, found empty line";
91 throw GBS_global_string(
"Expected feature line (first char at pos=%zu unexpected)", first_char);
95 void FeatureLine::interpret_as_continued_line() {
96 rest = orgLine.substr(21);
97 if (rest[rest.length()-1] ==
'"') {
109 string::size_type first_char = orgLine.find_first_not_of(
' ', 5);
110 if (first_char >= 21) {
111 interpret_as_continued_line();
122 : db_writer(DB_writer),
124 tagTranslator(meta_description),
125 expectedSeqLength(-1)
149 if (fline->type &
FL_META_CONTINUED)
throw "Expected start of feature or qualifier";
155 while (!next_fline.
isNull() &&
162 if (!next_fline->reinterpret_as_continued_line()) {
163 throw "did not see end of quoted qualifier (instead found next qualifiert)";
181 fline->rest.append(next_fline->rest);
196 if (fline->type !=
FL_START)
throw "Expected feature start";
198 feature =
new Feature(fline->name, fline->rest);
202 feature->addQualifiedEntry(fline->name, fline->rest);
214 while (!feature.
isNull()) {
216 feature->fixEmptyQualifiers();
224 const char *what = import_of_what.c_str();
225 stringVectorCRIter e =
warnings.rend();
226 for (stringVectorCRIter i =
warnings.rbegin(); i != e; ++i) {
262 warning(
"Sequence data is empty (only metadata found).");
266 warning(
"No 'BASE COUNT' found. Base counts have not been validated.");
285 {
" AUTHORS",
"author",
MT_REF },
286 {
" TITLE",
"title",
MT_REF },
287 {
" CONSRTM",
"refgrp",
MT_REF },
288 {
" JOURNAL",
"journal",
MT_REF },
289 {
" PUBMED",
"pubmed_id",
MT_REF },
290 {
" MEDLINE",
"medline_id",
MT_REF },
291 {
" REMARK",
"refremark",
MT_REF },
293 {
"DEFINITION",
"definition",
MT_BASIC },
298 {
"SOURCE",
"full_name",
MT_BASIC },
316 {
"RA",
"author",
MT_REF },
317 {
"RC",
"auth_comm",
MT_REF },
318 {
"RG",
"refgrp",
MT_REF },
319 {
"RL",
"journal",
MT_REF },
320 {
"RP",
"nuc_rp",
MT_REF },
321 {
"RT",
"title",
MT_REF },
325 {
"AH",
"assembly_header",
MT_BASIC },
326 {
"AS",
"assembly_info",
MT_BASIC },
353 :
Importer(Flatfile, DB_writer, genebank_meta_description)
356 bool GenebankImporter::readFeatureTableLine(
string&
line) {
371 string::size_type first_non_space = line.find_first_not_of(
' ');
373 if (first_non_space >= 12 ||
374 (first_non_space == string::npos && line.length() == 12)) {
376 content = line.substr(12);
382 string::size_type behind_tag = line.find_first_of(
' ', first_non_space);
383 if (behind_tag == string::npos) {
389 string::size_type content_start = line.find_first_not_of(
' ', behind_tag);
390 if (content_start == string::npos) {
394 content = line.substr(content_start);
397 tag = line.substr(0, behind_tag);
413 void GenebankImporter::import_section() {
420 bool seenHeaderLine =
false;
425 string line, tag, content;
432 if (!content.empty()) {
433 prevContent.append(1,
' ');
434 prevContent.append(content);
442 switch (prevTag->
type) {
444 case MT_BASIC: meta.
add(prevTag, prevContent,
true);
break;
446 meta.
add(prevTag, prevContent,
true);
455 switch (knownTag->
type) {
457 if (seenHeaderLine)
throw GBS_global_string(
"Multiple occurrences of tag '%s'", tag.c_str());
458 seenHeaderLine =
true;
463 prevContent = content;
476 parseSequence(knownTag->
tag, content);
506 :
Importer(Flatfile, DB_writer, embl_meta_description)
509 static bool splitEmblTag(
const string& line,
string& tag,
string& content) {
513 if (line.length() == 2) {
518 string::size_type spacer = line.find(
" ");
519 if (spacer != 2)
return false;
521 tag = line.substr(0, 2);
522 content = line.substr(5);
528 bool EmblImporter::readFeatureTableLine(
string& line) {
549 bp = atol(lastWord.c_str());
557 if (bp == -1)
throw "Could not parse bp from header";
562 void EmblImporter::import_section() {
568 bool prevAppendNL =
false;
570 bool seenHeaderLine =
false;
575 string line, tag, content;
578 throw "Expected two-character tag at start of line";
584 if (knownTag == prevTag) {
585 if (prevAppendNL) prevContent.append(
"\n");
586 prevContent.append(content);
590 switch (prevTag->
type) {
593 case MT_BASIC: meta.
add(prevTag, prevContent,
true);
break;
595 meta.
add(prevTag, prevContent,
true);
603 switch (knownTag->
type) {
605 if (seenHeaderLine)
throw GBS_global_string(
"Multiple occurrences of tag '%s'", tag.c_str());
606 seenHeaderLine =
true;
611 prevContent = content;
616 prevContent = content;
631 parseSequence(content);
663 long count = parser.
eatNumber(seen_number);
679 if (expect)
throw GBS_global_string(
"Expected counter '### %s', found '%s'", word, parser.
rest().c_str());
684 void GenebankImporter::parseSequence(
const string& tag,
const string& headerline) {
693 parser.expectContent(
"COUNT");
711 if (!headerCount.
isNull()) {
715 if (!
beginsWith(line,
"ORIGIN"))
throw "Expected 'ORIGIN'";
718 bool eos_seen =
false;
730 size_t cur_pos = (
size_t)parser.extractNumber();
731 size_t datasize = seqData.getBaseCounter().getCount(
BC_ALL);
733 if (cur_pos != (datasize+1)) {
734 throw GBS_global_string(
"Got wrong base position (found=%zu, expected=%zu)", cur_pos,
size_t(datasize+1));
738 while (!parser.atEnd() && parser.at() ==
' ') {
739 parser.expectSpaces(1);
741 stringCIter
start = parser.pos;
742 stringCIter end = parser.find(
' ');
744 data.append(start, end);
748 if (blocks>6)
throw "Found more than 6 parts of sequence data";
749 seqData.addLine(data);
758 void EmblImporter::parseSequence(
const string& headerline) {
764 parser.expectContent(
"Sequence");
773 headerCount.checkOverallCounter();
779 bool eos_seen =
false;
792 parser.expectSpaces(5,
false);
794 while (!parser.atEnd() && isalpha(parser.at())) {
795 stringCIter start = parser.pos;
796 stringCIter end = parser.find(
' ');
798 data.append(start, end);
800 parser.expectSpaces(1);
803 if (blocks>6)
throw "Found more than 6 parts of sequence data";
805 size_t basecount = (
size_t)parser.extractNumber();
807 seqData.addLine(data);
808 size_t datasize = seqData.getBaseCounter().getCount(
BC_ALL);
810 if (basecount != datasize) {
811 throw GBS_global_string(
"Got wrong base counter(found=%zu, expected=%zu)", basecount, datasize);
void backFeatureTableLine(FeatureLinePtr &fline)
void calcOverallCounter()
void show_warnings(const string &import_of_what)
void check_base_counters(const SequenceBuffer &seqData, const BaseCounter *headerCount)
void writeSequence(const SequenceBuffer &seqData)
FeatureLine(const string &line)
bool getLine(string &line)
void writeFeature(const Feature &feature, long seqLength)
bool parseCounter(bool expect, BaseCounter &headerCount, StringParser &parser, Base base, const char *word)
static MetaTag embl_meta_description[]
void expectLine(string &line)
virtual bool readFeatureTableLine(string &line)=0
void add_dbid(const std::string &content)
size_t lookingAt(const char *content)
static bool splitGenebankTag(const string &line, string &tag, string &content)
GenebankImporter(LineReader &Flatfile, DBwriter &DB_writer)
const char * GBS_global_string(const char *templat,...)
void expectEqual(const BaseCounter &other) const
size_t getCount(Base base) const
void expectContent(const char *content)
Importer(LineReader &Flatfile, DBwriter &DB_writer, const MetaTag *meta_description)
bool isNull() const
test if SmartPtr is NULp
void finalizeOrganism(const MetaInfo &meta, const References &refs, Importer &importer)
void warning(const char *msg)
static HelixNrInfo * start
void GB_warningf(const char *templat,...)
FeatureLinePtr getFeatureTableLine()
virtual void import_section()=0
void backLine(const string &line)
string lineError(const string &msg) const
const BaseCounter & getBaseCounter() const
EmblImporter(LineReader &Flatfile, DBwriter &DB_writer)
void createOrganism(const string &flatfile, const char *importerTag)
void advance(size_t offset)
static long scanSeqlenFromLOCUS(const string &locusContent)
FeatureLines pushedFeatureLines
std::string extractWord(const char *delimiter=" ")
static MetaTag genebank_meta_description[]
static long scanSeqlenFromID(const string &idContent)
FeatureLinePtr getUnwrappedFeatureTableLine()
const MetaTag * findTag(const string &tag)
const T * content() const
convert SmartPtr to plain old pointer (also works if isNull())
bool beginsWith(const std::string &str, const std::string &start)
void add(const std::string &field, const std::string &content)
virtual const string & getFilename() const =0
static bool splitEmblTag(const string &line, string &tag, string &content)
FeaturePtr parseFeature()
void setPosition(const stringCIter &position)
size_t expectSpaces(size_t count=1, bool allowMore=true)
const string & getMessage() const
bool reinterpret_as_continued_line()
void addCount(Base base, size_t amount)
long eatNumber(bool &eaten)
stringCIter getPosition() const
static bool is_escaped(const string &str, size_t pos)