23 #define MAX_SEQUENCE_PER_MASTER 50 // was 18 till May 2008
66 unsigned char *
con[256];
82 Consensus *gcon = ARB_calloc<Consensus>(1);
83 unsigned char *data = ARB_calloc<unsigned char>(256*len);
87 for (
int i=0; i<256; i++) {
88 gcon->
con[i] = data + len*i;
104 if (seq_len > gcon->
len) seq_len = gcon->
len;
107 unsigned char *
s =
seq;
113 for (li = i = 0; i < seq_len; i++) {
122 unsigned char *p = gcon->
con[last];
127 while (li < i) p[li++] += c;
132 while (li < i) p[li++] += c;
135 while (li < i) p[li++] |= 1;
152 unsigned char *
max = ARB_calloc<unsigned char>(gcon->
len);
153 char *
seq = ARB_calloc<char>(gcon->
len+1);
155 memset(seq,
'@', gcon->
len);
157 for (
int c = 1; c<256; c++) {
158 if (!gcon->
used[c])
continue;
160 for (pos = 0; pos<gcon->
len; pos++) {
174 if (node->is_leaf())
return 1;
175 node->gb_node =
NULp;
180 if (ctree->is_leaf()) {
181 if (ctree->index >= 0) {
187 else if (ctree->index<0) {
199 if (node->is_leaf()) {
200 if (node->index >= 0) {
204 seqs[node->index].
master = my_master;
208 if (progress.
aborted())
return;
210 if (node->index>=0) {
211 masters[node->index]->
master = my_master;
212 my_master = node->index;
214 g_b_create_master(node->get_leftson(), seqs, masters, my_master, ali_name, seq_len, progress);
215 g_b_create_master(node->get_rightson(), seqs, masters, my_master, ali_name, seq_len, progress);
216 if (node->index>=0 && !progress.
aborted()) {
241 node->sons -= subtract;
242 node = node->get_father();
247 if (!node->is_leaf()) {
248 if (node->sons == wantedSons) {
251 node->index = *mcount;
257 else if (node->sons>wantedSons) {
261 int maxSons = lMax<rMax ? rMax : lMax;
263 maxSons = node->sons;
272 if (node->is_leaf()) {
279 #if defined(SAVE_COMPRESSION_TREE_TO_DB)
280 freenull(node->name);
281 if (node->index2 != -1) {
284 #endif // SAVE_COMPRESSION_TREE_TO_DB
286 return (left>right ? left : right) + (node->index == -1 ? 0 : 1);
290 if (node->is_leaf()) {
296 node->index = *scount;
312 while (wantedSons >= 2) {
314 wantedSons = maxSons;
324 #define MAX_NUMBER 0x7fffffff
338 result = c4 | (c3<<8) | (c2<<16) | (c1<<24);
341 result = c3 | (c2<<8) | (c1<<16) | ((c0 & 0x0f)<<24);
345 result = c2 | (c1<<8) | ((c0 & 0x1f)<<16);
349 result = c1 | ((c0 & 0x3f)<<8);
368 j = (i>>8) | 0x80; *s++ = j;
371 else if (i<0x200000) {
372 j = (i>>16) | 0xC0; *s++ = j;
373 j = (i>>8); *s++ = j;
376 else if (i<0x10000000) {
377 j = (i>>24) | 0xE0; *s++ = j;
378 j = (i>>16); *s++ = j;
379 j = (i>>8); *s++ = j;
384 j = (i>>24); *s++ = j;
385 j = (i>>16); *s++ = j;
386 j = (i>>8); *s++ = j;
405 unsigned char INIT = 0xaa;
406 memset(buffer, INIT, BUFSIZE);
411 unsigned char *bp =
buffer;
414 size_t bytes_written = bp-
buffer;
417 if (buffer_expected) {
422 const unsigned char *bp =
buffer;
427 size_t bytes_read = bp-
buffer;
433 return all().ofgroup(expected);
436 #define TEST_PUT_READ_NUMBER(num,expect_bytes) TEST_EXPECTATION(put_read_num_using_bytes(num, expect_bytes))
437 #define TEST_PUT_READ_NUMBER__BROKEN(num,expect_bytes) TEST_EXPECTATION__BROKEN(put_read_num_using_bytes(num, expect_bytes))
439 #define TEST_PUT_NUMBER_BINARY1(num, byte1) do { \
440 unsigned char buf[1]; \
442 TEST_EXPECTATION(put_read_num_using_bytes(num, 1, buf)); \
445 #define TEST_PUT_NUMBER_BINARY2(num, byte1, byte2) do { \
446 unsigned char buf[2]; \
449 TEST_EXPECTATION(put_read_num_using_bytes(num, 2, buf)); \
452 #define TEST_PUT_NUMBER_BINARY3(num, byte1, byte2, byte3) do { \
453 unsigned char buf[3]; \
457 TEST_EXPECTATION(put_read_num_using_bytes(num, 3, buf)); \
460 #define TEST_PUT_NUMBER_BINARY4(num, byte1, byte2, byte3, byte4) do { \
461 unsigned char buf[4]; \
466 TEST_EXPECTATION(put_read_num_using_bytes(num, 4, buf)); \
469 #define TEST_PUT_NUMBER_BINARY5(num, byte1, byte2, byte3, byte4, byte5) do { \
470 unsigned char buf[5]; \
476 TEST_EXPECTATION(put_read_num_using_bytes(num, 5, buf)); \
479 void TEST_put_read_number() {
481 TEST_PUT_READ_NUMBER(0x0, 1);
483 TEST_PUT_READ_NUMBER(0x7f, 1);
484 TEST_PUT_READ_NUMBER(0x80, 2);
486 TEST_PUT_READ_NUMBER(0x3fff, 2);
487 TEST_PUT_READ_NUMBER(0x4000, 3);
489 TEST_PUT_READ_NUMBER(0x1fffff, 3);
490 TEST_PUT_READ_NUMBER(0x200000, 4);
492 TEST_PUT_READ_NUMBER(0xfffffff, 4);
494 TEST_PUT_READ_NUMBER(0x10000000, 5);
495 TEST_PUT_READ_NUMBER(0x7fffffff, 5);
500 TEST_PUT_NUMBER_BINARY1(0x0, 0x00);
501 TEST_PUT_NUMBER_BINARY1(0x7f, 0x7f);
503 TEST_PUT_NUMBER_BINARY2(0x80, 0x80, 0x80);
504 TEST_PUT_NUMBER_BINARY2(0x81, 0x80, 0x81);
505 TEST_PUT_NUMBER_BINARY2(0x3fff, 0xbf, 0xff);
507 TEST_PUT_NUMBER_BINARY3(0x4000, 0xc0, 0x40, 0x00);
508 TEST_PUT_NUMBER_BINARY3(0x1fffff, 0xdf, 0xff, 0xff);
510 TEST_PUT_NUMBER_BINARY4(0x200000, 0xe0, 0x20, 0x00, 0x00);
511 TEST_PUT_NUMBER_BINARY4(0xfffffff, 0xef, 0xff, 0xff, 0xff);
513 TEST_PUT_NUMBER_BINARY5(0x10000000, 0xf0, 0x10, 0x00, 0x00, 0x00);
514 TEST_PUT_NUMBER_BINARY5(0x7fffffff, 0xf0, 0x7f, 0xff, 0xff, 0xff);
524 size_t *memsize,
int old_flag) {
534 if (seq_len > master_len) {
535 rest = seq_len - master_len;
540 for (i = len; i>0; i--) {
543 if (cm==cs && cs != last) {
552 for (i = rest; i>0; i--) {
557 unsigned char *buffer2;
558 unsigned char *dest2;
567 *memsize = *memsize + (dest2-buffer2);
568 return (
char *)buffer2;
573 GBQUARK q,
const char *
seq,
size_t seq_len,
size_t *memsize)
595 error =
"Tree is empty";
598 arb_progress tree_progress(
"Compressing sequences", 4L);
601 long mastercount = 0;
602 int max_compSteps = 0;
608 "Skipping compression for this alignment",
614 #if defined(SAVE_COMPRESSION_TREE_TO_DB)
617 if (!error) error =
GBT_write_tree(gb_main, 0,
"tree_compression_new", tree);
618 GB_information(
"Only generated compression tree (do NOT save DB anymore)");
621 #endif // SAVE_COMPRESSION_TREE_TO_DB
626 int min_compSteps = 1;
636 long acceptable_masters = (3*min_masters)/2;
637 int acceptable_compSteps = 11*min_compSteps;
639 if (mastercount>acceptable_masters || max_compSteps>acceptable_compSteps) {
640 GB_warningf(
"Tree is ill-suited for compression (cause of deep branches)\n"
641 " Used tree Optimal tree Overhead\n"
642 "Compression steps %5i %5i %4i%% (speed)\n"
643 "Master sequences %5li %5li %4li%% (size)\n"
644 "If you like to restart with a better tree,\n"
645 "press 'Abort' to stop compression",
646 max_compSteps, min_compSteps, (100*max_compSteps)/min_compSteps-100,
647 mastercount, min_masters, (100*mastercount)/min_masters-100);
660 unsigned long long sumorg = 0;
661 unsigned long long sumold = 0;
662 unsigned long long sumnew = 0;
669 free(masterfoldername);
685 free(master_data_name);
687 for (
long si = 0; si<mastercount; si++) {
694 arb_progress progress(
"Building master sequences", mastercount);
704 arb_progress progress(
"Compressing sequences in tree", seqcount);
706 for (
long si=0; si<seqcount && !
error; si++) {
712 GB_warning(
"A species seems to be more than once in the tree");
742 long speciesNotInTree = 0;
746 for (pass = 1; pass <= 2; ++pass) {
780 speciesNotInTree = count;
781 if (speciesNotInTree>0) {
782 progress =
new arb_progress(
"Compressing sequences NOT in tree", speciesNotInTree);
793 arb_progress progress(
"Compressing master-sequences", mastercount);
796 for (
long si=0; si<mastercount; si++) {
797 int mi = masters[si]->
master;
806 error =
GB_export_error(
"Sequence Compression: Master Index Conflict");
839 for (gb_omaster =
GB_entry(old_gb_master_ali,
"@master");
854 " Uncompressed data: %7s\n"
855 " Old compressed data: %7s = %6.2f%%\n"
856 " New compressed data: %7s = %6.2f%%",
858 sizeOld, (100.0*sumold)/sumorg,
859 sizeNew, (100.0*sumnew)/sumorg);
869 if (old_gb_master_ali) error =
GB_delete(old_gb_master_ali);
875 for (
long si=0; si<mastercount; si++) free(masters[si]);
879 tree_progress.
done();
895 error =
"Compress Sequences called while transaction running";
908 if (!tree_name || !strlen(tree_name)) {
923 error = ta.
close(error);
927 #if defined(SAVE_COMPRESSION_TREE_TO_DB)
928 error =
"fake error";
929 #endif // SAVE_COMPRESSION_TREE_TO_DB
946 printf(
"Recompression data in alignment '%s' using tree '%s'\n", ali_name, tree_name);
960 const signed char *source = (
signed char *)s;
962 const char *m = master;
985 j = (*source++) & 0xff;
986 j |= (
static_cast<unsigned char>(*source++) << 8) & 0xff00;
1002 memset(dest, c, -j);
1025 *error =
"Can not uncompress this sequence (neither has father nor inside callback)";
1034 const unsigned char *
s = (
const unsigned char *)ss;
1039 ss = (
const char *)s;
1047 *error =
"Cannot uncompress this sequence: Cannot find a master sequence";
1076 void TEST_SLOW_sequence_compression() {
1077 const char *source =
"TEST_nuc.arb";
1078 const char *compressed =
"TEST_nuc_seqcompr.arb";
1079 const char *expected =
"TEST_nuc_seqcompr_exp.arb";
1080 const char *aliname =
"ali_16s";
1084 const int SEQ2COMPARE = 7;
1085 char *seq_exp[SEQ2COMPARE];
1089 TEST_EXPECT_RESULT__NOERROREXPORTED(gb_main =
GB_open(source,
"rw"));
1096 gb_species && count<SEQ2COMPARE;
1108 #if defined(TEST_AUTO_UPDATE)
1109 TEST_COPY_FILE(compressed, expected);
1115 TEST_EXPECT_RESULT__NOERROREXPORTED(gb_main =
GB_open(compressed,
"rw"));
1121 gb_species && count<SEQ2COMPARE;
1129 freenull(seq_exp[count]);
1140 #endif // UNIT_TESTS
GB_ERROR GB_begin_transaction(GBDATA *gbd)
static char * gb_compress_seq_by_master(const char *master, size_t master_len, int master_index, GBQUARK q, const char *seq, size_t seq_len, size_t *memsize, int old_flag)
GB_MAIN_TYPE * gb_get_main_during_cb()
void compute_tree() OVERRIDE
GBDATA * GB_open(const char *path, const char *opent)
void GB_warning(const char *message)
void GBT_compression_test(struct Unfixed_cb_parameter *, GBDATA *gb_main)
static char * gb_compress_sequence_by_master(GBDATA *gbd, const char *master, size_t master_len, int master_index, GBQUARK q, const char *seq, size_t seq_len, size_t *memsize)
GB_BUFFER gb_compress_data(GBDATA *gbd, int key, GB_CBUFFER source, size_t size, size_t *msize, GB_COMPRESSION_MASK max_compr, bool pre_compressed)
GB_ERROR GB_write_string(GBDATA *gbd, const char *s)
static Consensus * g_b_new_Consensus(long len)
#define ASSERT_NO_ERROR(errorExpr)
GB_MAIN_TYPE * GB_MAIN(GBDATA *gbd)
GBDATA * GB_nextEntry(GBDATA *entry)
GB_ERROR GB_end_transaction(GBDATA *gbd, GB_ERROR error)
static void g_b_Consensus_add(Consensus *gcon, unsigned char *seq, long seq_len)
void GB_disable_quicksave(GBDATA *gbd, const char *reason)
static char * g_b_uncompress_single_sequence_by_master(const char *s, const char *master, size_t size, size_t *new_size)
const char * GBT_name_of_largest_tree(GBDATA *gb_main)
TreeNode * makeNode() const OVERRIDE
GB_ERROR GBT_compress_sequence_tree2(GBDATA *gbd, const char *tree_name, const char *ali_name)
GBCONTAINER * gb_create_container(GBCONTAINER *father, const char *key)
char * ARB_strdup(const char *str)
TreeNode * GBT_read_tree(GBDATA *gb_main, const char *tree_name, TreeRoot *troot)
GBCONTAINER * gb_master_ali
const char * GBS_global_string(const char *templat,...)
void warning(int warning_num, const char *warning_message)
long GBT_get_alignment_len(GBDATA *gb_main, const char *aliname)
char * GB_check_out_buffer(GB_CBUFFER buffer)
GBQUARK gb_find_or_create_quark(GB_MAIN_TYPE *Main, const char *key)
int GB_unlink(const char *path)
char * gb_uncompress_by_sequence(GBDATA *gbd, const char *ss, size_t size, GB_ERROR *error, size_t *new_size)
static int init_indices_and_count_sons(CompressionTree *node, long *scount, const char *ali_name)
GBCONTAINER * root_container
GB_ERROR GBT_link_tree(TreeNode *tree, GBDATA *gb_main, bool show_status, int *zombies, int *duplicates)
char buffer[MESSAGE_BUFFERSIZE]
static void subtract_sons_from_tree(CompressionTree *node, int subtract)
#define DOWNCAST(totype, expr)
GB_ERROR error_if_aborted()
GBENTRY * gb_create(GBCONTAINER *father, const char *key, GB_TYPES type)
GB_ERROR GB_delete(GBDATA *&source)
GBDATA * GBT_first_species_rel_species_data(GBDATA *gb_species_data)
GB_BUFFER GB_give_other_buffer(GB_CBUFFER buffer, long size)
#define TEST_PUBLISH(testfunction)
GB_ERROR GB_export_error(const char *error)
size_t GB_read_string_count(GBDATA *gbd)
GB_ERROR GB_await_error()
#define MAX_SEQUENCE_PER_MASTER
void destroyNode(TreeNode *node) const OVERRIDE
char * GBT_read_string(GBDATA *gb_container, const char *fieldpath)
void GB_warningf(const char *templat,...)
bool isSet() const
test if SmartPtr is not NULp
long GB_read_memuse(GBDATA *gbd)
static void distribute_masters(CompressionTree *tree, long *mcount, int *max_masters)
static void g_b_create_master(CompressionTree *node, Sequence *seqs, MasterSequence **masters, int my_master, const char *ali_name, long seq_len, arb_progress &progress)
GB_ERROR GB_save_as(GBDATA *gbd, const char *path, const char *savetype)
GB_ERROR GBT_write_tree(GBDATA *gb_main, const char *tree_name, TreeNode *tree)
const char * GBS_readable_size(unsigned long long size, const char *unit_suffix)
STATIC_ASSERT(MAX_NUMBER<=INT_MAX)
static void error(const char *msg)
static int maxCompressionSteps(CompressionTree *node)
expectation_group & add(const expectation &e)
GBDATA * gb_find_by_nr(GBCONTAINER *father, int index)
int g_b_read_number2(const unsigned char *&s)
GBCONTAINER * as_container() const
#define TEST_EXPECT_ZERO_OR_SHOW_ERRNO(iocond)
CONSTEXPR_INLINE GBCONTAINER * GB_FATHER(GBDATA *gbd)
GB_ERROR GB_write_security_write(GBDATA *gbd, unsigned long level)
GBDATA * GBT_find_sequence(GBDATA *gb_species, const char *aliname)
~CompressionTree() OVERRIDE
int gb_read_nr(GBDATA *gbd)
int get_transaction_level() const
void gb_compress_equal_bytes_2(const char *source, size_t size, size_t *msize, char *dest)
static GB_ERROR compress_sequence_tree(GBCONTAINER *gb_main, CompressionTree *tree, const char *ali_name)
static int g_b_count_leafs(CompressionTree *node)
void GB_internal_error(const char *message)
static void g_b_put_sequences_in_container(CompressionTree *ctree, Sequence *seqs, MasterSequence **masters, Consensus *gcon)
TYPE * ARB_calloc(size_t nelem)
void g_b_put_number2(int i, unsigned char *&s)
GB_ERROR close(GB_ERROR error)
GB_UNDO_TYPE GB_get_requested_undo_type(GBDATA *gb_main)
static void g_b_delete_Consensus(Consensus *gcon)
#define memory_is_equal(m1, m2, size)
#define GB_RUNLENGTH_SIZE
#define TEST_EXPECT_FILES_EQUAL(f1, f2)
GB_ERROR gb_write_compressed_pntr(GBENTRY *gbe, const char *s, long memsize, long stored_size)
char * GB_read_string(GBDATA *gbd)
static int set_masters_with_sons(CompressionTree *node, int wantedSons, long *mcount)
GBDATA * GBT_first_species(GBDATA *gb_main)
GB_ERROR GB_write_security_delete(GBDATA *gbd, unsigned long level)
#define TEST_EXPECT_NO_ERROR(call)
CompressionTree(CompressionRoot *croot)
GBENTRY * as_entry() const
long GB_read_clock(GBDATA *gbd)
GBDATA * GBT_next_species(GBDATA *gb_species)
void GB_information(const char *message)
static char * g_b_Consensus_get_sequence(Consensus *gcon)
GBDATA * gb_search(GBCONTAINER *gbc, const char *key, GB_TYPES create, int internflag)
char * GBT_get_default_alignment(GBDATA *gb_main)
#define DEFINE_TREE_ACCESSORS(RootType, TreeType)
GB_ERROR GB_request_undo_type(GBDATA *gb_main, GB_UNDO_TYPE type) __ATTR__USERESULT_TODO
GB_transaction ta(gb_var)
void destroy(TreeNode *that)
GB_CSTR GB_read_char_pntr(GBDATA *gbd)
void gb_load_single_key_data(GBDATA *gb_main, GBQUARK q)
GBDATA * GB_search(GBDATA *gbd, const char *fieldpath, GB_TYPES create)
#define TEST_EXPECT_EQUAL(expr, want)
unsigned get_leaf_count() const OVERRIDE
GBDATA * GB_entry(GBDATA *father, const char *key)
void inc_and_check_user_abort(GB_ERROR &error)
char * GBS_global_string_copy(const char *templat,...)
void GB_close(GBDATA *gbd)
GBDATA * GBT_get_species_data(GBDATA *gb_main)
GB_write_int const char s