ARB
adoptimize.cxx
1 // =============================================================== //
2 // //
3 // File : adoptimize.cxx //
4 // Purpose : //
5 // //
6 // Institute of Microbiology (Technical University Munich) //
7 // http://www.arb-home.de/ //
8 // //
9 // =============================================================== //
10 
11 #include <climits>
12 #include <netinet/in.h>
13 
14 #include <arb_file.h>
15 #include <arb_diff.h>
16 
17 #include <arbdbt.h>
18 
19 #include "gb_key.h"
20 #include "gb_compress.h"
21 #include "gb_dict.h"
22 
23 #include "arb_progress.h"
24 
25 #if defined(DEBUG)
26 // #define TEST_DICT
27 #endif // DEBUG
28 
29 typedef unsigned char unsigned_char;
30 typedef unsigned char *u_str;
31 typedef const unsigned char *cu_str;
32 
33 static int gbdByKey_cnt;
34 struct O_gbdByKey { // one for each diff. keyQuark
35  int cnt;
36  GBDATA **gbds; // gbdoff
37 };
38 
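// Illustrative sketch, not part of the original adoptimize.cxx: how the per-quark index
// above is typically consumed further below (see build_dict_tree() and test_dictionary()).
// 'gbk' is assumed to be the array built from the main key table; 'quark' selects one key.
static void visit_entries_of_quark_demo(O_gbdByKey *gbk, GBQUARK quark) {
    for (int i = 0; i < gbk[quark].cnt; i++) {
        GBDATA *gbd = gbk[quark].gbds[i];   // every database entry stored under this key quark
        (void)gbd;                          // real callers inspect or compress the entry here
    }
}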
39 struct FullDictTree;
40 struct SingleDictTree;
41 
42 union DictTree {
43  FullDictTree *full;
44  SingleDictTree *single;
45  void *exists;
46 
47 };
48 
49 enum DictNodeType { SINGLE_NODE, FULL_NODE };
50 
51 struct FullDictTree {
52  DictNodeType typ; // always FULL_NODE
53  int usedSons;
54  int count[256];
55  DictTree son[256]; // index == character
56 };
57 
58 struct SingleDictTree {
59  DictNodeType typ; // always SINGLE_NODE
60  unsigned_char ch; // the character
61  int count; // no of occurrences of this branch
62  DictTree son;
63  DictTree brother;
64 
65 };
66 
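// Illustrative sketch, not part of the original file: both node structs start with a
// DictNodeType member, so the code below inspects tree.full->typ (or tree.single->typ)
// to decide which union member is valid. A minimal traversal using that idiom:
static int count_dtree_nodes_demo(DictTree tree) {
    if (!tree.exists) return 0;                         // empty (sub-)tree
    if (tree.full->typ == FULL_NODE) {                  // discriminant readable through either pointer
        int n = 1;
        for (int i = 0; i < 256; i++) n += count_dtree_nodes_demo(tree.full->son[i]);
        return n;
    }
    // SINGLE_NODE: one character, then its son subtree and the chain of brothers
    return 1 + count_dtree_nodes_demo(tree.single->son) + count_dtree_nodes_demo(tree.single->brother);
}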
67 // **************************************************
68 
69 #define COMPRESSIBLE(type) ((type) >= GB_BYTES && (type)<=GB_STRING)
70 #define DICT_MEM_WEIGHT 4
71 
72 #define WORD_HELPFUL(wordlen, occurrences) ((long)((occurrences)*3 + DICT_MEM_WEIGHT*(2*sizeof(GB_NINT)+(wordlen))) \
73  < \
74  (long)((occurrences)*(wordlen)))
75 /* (occurrences)*4 compressed size
76  * 2*sizeof(GB_NINT)+(wordlen) size in dictionary
77  * (occurrences)*(wordlen) uncompressed size
78  */
79 
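// Illustrative sketch, not part of the original file: the WORD_HELPFUL tradeoff written out
// with concrete numbers (DICT_MEM_WEIGHT==4 as above; a 4-byte GB_NINT is assumed).
static bool word_helpful_demo(long wordlen, long occurrences) {
    const long refs = occurrences * 3;          // cost of the compressed references
    const long dict = 4 * (2*4 + wordlen);      // dictionary cost, weighted by DICT_MEM_WEIGHT
    const long flat = occurrences * wordlen;    // cost of leaving the data uncompressed
    return refs + dict < flat;
}
// word_helpful_demo(12, 20) -> true  (60 + 80 = 140 < 240): the word pays off
// word_helpful_demo( 8,  3) -> false ( 9 + 64 =  73 >  24): too rare to justify a dictionary entry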
80 // **************************************************
81 
82 #define MIN_WORD_LEN 8 // minimum length of words in dictionary
83 #define MAX_WORD_LEN 50 // maximum length of words in dictionary
84 #define MAX_BROTHERS 10 /* maximum no of brothers linked with SingleDictTree
85  * above that we use FullDictTree */
86 #define MAX_DIFFER 2 /* percentage of difference (of occurrences of strings) below which two
87  * consecutive parts are treated as EQUAL # of occurrences */
88 #define INCR_DIFFER 1 // the above percentage is incremented from 0 to MAX_DIFFER by INCR_DIFFER per step
89 
90 #define DICT_STRING_INCR 1024 // dictionary string will be incremented by this size
91 
92 // ******************* Tool functions ******************
93 
94 inline cu_str get_data_n_size(GBDATA *gbd, size_t *size) {
95  GB_CSTR data;
96  *size = 0;
97 
98  switch (gbd->type()) {
99  case GB_STRING: data = GB_read_char_pntr(gbd); break;
100  case GB_BYTES: data = GB_read_bytes_pntr(gbd); break;
101  case GB_INTS: data = (char*)GB_read_ints_pntr(gbd); break;
102  case GB_FLOATS: data = (char*)GB_read_floats_pntr(gbd); break;
103  default:
104  data = NULp;
105  gb_assert(0);
106  break;
107  }
108 
109  if (data) *size = gbd->as_entry()->uncompressed_size();
110  return (cu_str)data;
111 }
112 
113 static inline long min(long a, long b) {
114  return a<b ? a : b;
115 }
116 
117 // **************************************************
118 
119 static void g_b_opti_scanGbdByKey(GB_MAIN_TYPE *Main, GBDATA *gbd, O_gbdByKey *gbk) {
120  if (gbd->is_container()) {
121  GBCONTAINER *gbc = gbd->as_container();
122  for (int idx=0; idx < gbc->d.nheader; idx++) {
123  GBDATA *gbd2 = GBCONTAINER_ELEM(gbc, idx);
124  if (gbd2) g_b_opti_scanGbdByKey(Main, gbd2, gbk);
125  }
126  }
127 
128  GBQUARK quark = GB_KEY_QUARK(gbd);
129  if (quark) {
130  gb_assert(gbk[quark].cnt < Main->keys[quark].nref || quark==0);
131  gb_assert(gbk[quark].gbds);
132 
133  gbk[quark].gbds[gbk[quark].cnt] = gbd;
134  gbk[quark].cnt++;
135  }
136 }
137 
138 static O_gbdByKey *g_b_opti_createGbdByKey(GB_MAIN_TYPE *Main) {
139  O_gbdByKey *gbk = ARB_calloc<O_gbdByKey>(Main->keycnt);
140 
141  gbdByKey_cnt = Main->keycnt; // always use gbdByKey_cnt instead of Main->keycnt because Main->keycnt can change
142 
143  for (int idx=1; idx<gbdByKey_cnt; idx++) {
144  gbk[idx].cnt = 0;
145 
146  gb_Key& KEY = Main->keys[idx];
147  if (KEY.key && KEY.nref>0) {
148  ARB_calloc(gbk[idx].gbds, KEY.nref);
149  }
150  else {
151  gbk[idx].gbds = NULp;
152  }
153  }
154 
155  gbk[0].cnt = 0;
156  ARB_calloc(gbk[0].gbds, 1);
157 
158  g_b_opti_scanGbdByKey(Main, Main->gb_main(), gbk);
159 
160  for (int idx=0; idx<gbdByKey_cnt; idx++) {
161  if (gbk[idx].cnt != Main->keys[idx].nref && idx) {
162  printf("idx=%i gbk[idx].cnt=%i Main->keys[idx].nref=%li\n", // Main->keys[].nref is wrong
163  idx, gbk[idx].cnt, Main->keys[idx].nref);
164 
165  Main->keys[idx].nref = gbk[idx].cnt;
166  }
167  }
168  return gbk;
169 }
170 
171 static void g_b_opti_freeGbdByKey(O_gbdByKey *gbk) {
172  for (int idx=0; idx<gbdByKey_cnt; idx++) free(gbk[idx].gbds);
173  free(gbk);
174 }
175 
176 // ******************* Convert old compression style to new style ******************
177 
178 static GB_ERROR gb_convert_compression(GBDATA *gbd) {
179  GB_ERROR error = NULp;
180 
181  if (gbd->is_container()) {
182  for (GBDATA *gb_child = GB_child(gbd); gb_child; gb_child = GB_nextChild(gb_child)) {
183  if (gb_child->flags.compressed_data || gb_child->is_container()) {
184  error = gb_convert_compression(gb_child);
185  if (error) break;
186  }
187  }
188  }
189  else {
190  char *str = NULp;
191  GBENTRY *gbe = gbd->as_entry();
192  long elems = gbe->size();
193  size_t data_size = gbe->uncompressed_size();
194  size_t new_size = -1;
195  int expectData = 1;
196 
197  switch (gbd->type()) {
198  case GB_STRING:
199  case GB_OBSOLETE:
200  case GB_BYTES:
201  str = gb_uncompress_bytes(gbe->data(), data_size, &new_size);
202  if (str) {
203  gb_assert(new_size == data_size);
204  str = GB_memdup(str, data_size);
205  }
206  break;
207 
208  case GB_INTS:
209  case GB_FLOATS:
210  str = gb_uncompress_longs_old(gbe->data(), elems, &new_size);
211  if (str) {
212  gb_assert(new_size == data_size);
213  str = GB_memdup(str, data_size);
214  }
215  break;
216 
217  default:
218  expectData = 0;
219  break;
220  }
221 
222  if (!str) {
223  if (expectData) {
224  error = GBS_global_string("Can't read old data to convert compression (Reason: %s)", GB_await_error());
225  }
226  }
227  else {
228  switch (gbd->type()) {
229  case GB_STRING:
230  error = GB_write_string(gbe, "");
231  if (!error) error = GB_write_string(gbe, str);
232  break;
233 
234  case GB_OBSOLETE:
235  error = GB_set_temporary(gbe); // exclude obsolete type from next save
236  break;
237 
238  case GB_BYTES:
239  error = GB_write_bytes(gbe, "", 0);
240  if (!error) error = GB_write_bytes(gbe, str, data_size);
241  break;
242 
243  case GB_INTS:
244  case GB_FLOATS:
245  error = GB_write_pntr(gbe, str, data_size, elems);
246  break;
247 
248  default:
249  gb_assert(0);
250  break;
251  }
252 
253  free(str);
254  }
255  }
256  return error;
257 }
258 
259 GB_ERROR gb_convert_V2_to_V3(GBDATA *gb_main) {
260  GB_ERROR error = NULp;
261  GBDATA *gb_system = GB_search(gb_main, GB_SYSTEM_FOLDER, GB_FIND);
262 
263  if (!gb_system) {
265  if (GB_entry(gb_main, "extended_data")) {
266  GB_warning("Converting data from old V2.0 to V2.1 Format:\n"
267  " Please Wait (may take some time)");
268  }
269  error = gb_convert_compression(gb_main);
270  GB_disable_quicksave(gb_main, "Database converted to new format");
271  }
272  return error;
273 }
274 
275 
276 // ********************* Compress by dictionary ********************
277 
278 
279 /* compression tag format:
280  *
281  * unsigned int compressed:1;
282  * if compressed==0:
283  * unsigned int last:1; ==1 -> this is the last block
284  * unsigned int len:6; length of uncompressible bytes
285  * char[len];
286  * if compressed==1:
287  * unsigned int idxlen:1; ==0 -> 10-bit index
288  * ==1 -> 18-bit index
289  * unsigned int idxhigh:2; the 2 highest bits of the index
290  * unsigned int len:4; (length of word) - (MIN_COMPR_WORD_LEN-1)
291  * if len==0:
292  * char extralen; (length of word) - MIN_LONGLEN
293  * char[idxlen+1]; index (low,high)
294  *
295  * tag == 64 -> end of dictionary compressed block (if not coded in last uncompressed block)
296  */
297 
298 inline int INDEX_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
299  gb_assert(idx<dict->words);
300  return ntohl(dict->offsets[idx]);
301 }
302 inline int ALPHA_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
303  int realIndex;
304  gb_assert(idx<dict->words);
305  realIndex = ntohl(dict->resort[idx]);
306  return INDEX_DICT_OFFSET(realIndex, dict);
307 }
308 
309 // #define ALPHA_DICT_OFFSET(i) ntohl(offset[ntohl(resort[i])])
310 // #define INDEX_DICT_OFFSET(i) ntohl(offset[i])
311 
312 #define LEN_BITS 4
313 #define INDEX_BITS 2
314 #define INDEX_LEN_BITS 1
315 
316 #define LEN_SHIFT 0
317 #define INDEX_SHIFT (LEN_SHIFT+LEN_BITS)
318 #define INDEX_LEN_SHIFT (INDEX_SHIFT+INDEX_BITS)
319 
320 #define BITMASK(bits) ((1<<(bits))-1)
321 #define GETVAL(tag,typ) (((tag)>>typ##_SHIFT)&BITMASK(typ##_BITS))
322 
323 #define MIN_SHORTLEN 6
324 #define MAX_SHORTLEN (BITMASK(LEN_BITS)+MIN_SHORTLEN-1)
325 #define MIN_LONGLEN (MAX_SHORTLEN+1)
326 #define MAX_LONGLEN (MIN_LONGLEN+255)
327 
328 #define SHORTLEN_DECR (MIN_SHORTLEN-1) // !! zero is used as flag for long len !!
329 #define LONGLEN_DECR MIN_LONGLEN
330 
331 #define MIN_COMPR_WORD_LEN MIN_SHORTLEN
332 #define MAX_COMPR_WORD_LEN MAX_LONGLEN
333 
334 #define MAX_SHORT_INDEX BITMASK(INDEX_BITS+8)
335 #define MAX_LONG_INDEX BITMASK(INDEX_BITS+16)
336 
337 #define LAST_COMPRESSED_BIT 64
338 
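// Illustrative sketch, not part of the original file: one short-form word reference assembled
// and decoded with the macros above (word index 341, word length 9).
static void tag_layout_demo() {
    int wordLen   = 9;                                      // MIN_SHORTLEN <= 9 <= MAX_SHORTLEN
    int wordIndex = 341;                                    // <= MAX_SHORT_INDEX -> 10-bit index (indexLen==0)
    int tag = 128                                           // bit 7 marks a compressed block
            | (0 << INDEX_LEN_SHIFT)
            | ((wordIndex >> 8) << INDEX_SHIFT)             // idxhigh = 1
            | ((wordLen - SHORTLEN_DECR) << LEN_SHIFT);     // len field = 4
    int low = wordIndex & 0xFF;                             // low index byte written after the tag

    gb_assert(tag == 0x94 && low == 0x55);
    gb_assert(GETVAL(tag, LEN) + SHORTLEN_DECR == wordLen);        // decoder recovers the length ...
    gb_assert(((GETVAL(tag, INDEX) << 8) | low) == wordIndex);     // ... and the word index
}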
339 #ifdef DEBUG
340 # define DUMP_COMPRESSION_TEST 0
341 /* 0 = only compression ratio
342  * 1 = + original/compressed/decompressed
343  * 2 = + words used to compress/uncompress
344  * 3 = + matching words in dictionary
345  * 4 = + search of words in dictionary
346  */
347 #else
348 # define DUMP_COMPRESSION_TEST 0
349 #endif
350 
351 #ifdef DEBUG
352 // #define COUNT_CHUNKS
353 
354 #if defined(COUNT_CHUNKS)
355 
356 static long uncompressedBlocks[64];
357 static long compressedBlocks[MAX_LONGLEN];
358 
359 static void clearChunkCounters() {
360  int i;
361 
362  for (i=0; i<64; i++) uncompressedBlocks[i] = 0;
363  for (i=0; i<MAX_LONGLEN; i++) compressedBlocks[i] = 0;
364 }
365 
366 static void dumpChunkCounters() {
367  int i;
368 
369  printf("------------------------------\n" "Uncompressed blocks used:\n");
370  for (i=0; i<64; i++) if (uncompressedBlocks[i]) printf(" size=%i used=%li\n", i, uncompressedBlocks[i]);
371  printf("------------------------------\n" "Words used:\n");
372  for (i=0; i<MAX_LONGLEN; i++) if (compressedBlocks[i]) printf(" size=%i used=%li\n", i, compressedBlocks[i]);
373  printf("------------------------------\n");
374 }
375 #endif // COUNT_CHUNKS
376 
377 static cu_str lstr(cu_str s, int len) {
378 #define BUFLEN 20000
379  static unsigned_char buf[BUFLEN];
380 
381  gb_assert(len<BUFLEN);
382  memcpy(buf, s, len);
383  buf[len] = 0;
384 
385  return buf;
386 }
387 
388 #if DUMP_COMPRESSION_TEST>=2
389 
390 static cu_str dict_word(GB_DICTIONARY *dict, int idx, int len) {
391  return lstr(dict->text+INDEX_DICT_OFFSET(idx, dict), len);
392 }
393 
394 #endif
395 
396 #if DUMP_COMPRESSION_TEST>=1
397 
398 static void dumpBinary(u_str data, long size) {
399 #define PER_LINE 12
400  int cnt = 0;
401 
402  while (size--) {
403  unsigned_char c = *data++;
404  int bitval = 128;
405  int bits = 8;
406 
407  while (bits--) {
408  putchar(c&bitval ? '1' : '0');
409  bitval>>=1;
410  }
411  putchar(' ');
412 
413  cnt = (cnt+1)%PER_LINE;
414  if (!cnt) putchar('\n');
415  }
416 
417  if (cnt) putchar('\n');
418 }
419 
420 #endif
421 
422 #endif
423 
424 inline int GB_MEMCMP(const void *vm1, const void *vm2, long size) {
425  char *c1 = (char*)vm1,
426  *c2 = (char*)vm2;
427  int diff = 0;
428 
429  while (size-- && !diff) diff = *c1++-*c2++;
430  return diff;
431 }
432 
433 // --------------------------------------------------
434 
435 static int searchWord(GB_DICTIONARY *dict, cu_str source, long size, unsigned long *wordIndex, int *wordLen) {
436  int idx = -1;
437  int l = 0;
438  int h = dict->words-1;
439  cu_str text = dict->text;
440  GB_NINT *resort = dict->resort;
441  int dsize = dict->textlen;
442  int ilen = 0;
443 
444  while (l<h-1) {
445  int m = (l+h)/2;
446  long off = ALPHA_DICT_OFFSET(m, dict);
447  cu_str dictword = text+off;
448  long msize = min(size, dsize-off);
449 
450 #if DUMP_COMPRESSION_TEST>=4
451  printf(" %s (%i)\n", lstr(dictword, 20), m);
452 #endif
453 
454  if (GB_MEMCMP(source, dictword, msize)<=0) h = m;
455  else l = m;
456  }
457 
458  while (l<=h) {
459  int off = ALPHA_DICT_OFFSET(l, dict);
460  cu_str word = text+off;
461  int msize = (int)min(size, dsize-off);
462  int equal = 0;
463  cu_str s = source;
464 
465  while (msize-- && *s++==*word++) equal++;
466 
467 #if DUMP_COMPRESSION_TEST>=3
468  if (equal>=MIN_COMPR_WORD_LEN) {
469  printf(" EQUAL=%i '%s' (%i->%i, off=%i)", equal, lstr(text+off, equal), l, ntohl(resort[l]), ALPHA_DICT_OFFSET(l, dict));
470  printf(" (context=%s)\n", lstr(text+off-min(off, 20), min(off, 20)+equal+20));
471  }
472 #endif
473 
474  if (equal>ilen) {
475  ilen = equal;
476  idx = ntohl(resort[l]);
477  gb_assert(idx<dict->words);
478  }
479 
480  l++;
481  }
482 
483  *wordIndex = idx;
484  *wordLen = (int)min(ilen, MAX_COMPR_WORD_LEN);
485 
486  return idx!=-1 && ilen>=MIN_COMPR_WORD_LEN;
487 }
488 
489 #ifdef DEBUG
490 int lookup_DICTIONARY(GB_DICTIONARY *dict, GB_CSTR source) { // used for debugging
491  unsigned long wordIndex;
492  int wordLen;
493  int wordFound = searchWord(dict, (cu_str)source, strlen(source), &wordIndex, &wordLen);
494 
495  if (wordFound) {
496  printf("'%s' (idx=%lu, off=%i)\n", lstr(dict->text+ntohl(dict->offsets[wordIndex]), wordLen), wordIndex, ntohl(dict->offsets[wordIndex]));
497  }
498 
499  return wordFound;
500 }
501 #endif
502 
503 
504 
505 static char *gb_uncompress_by_dictionary_internal(GB_DICTIONARY *dict, /* GBDATA *gbd, */ GB_CSTR s_source, const size_t size, bool append_zero, size_t *new_size) {
506  cu_str source = (cu_str)s_source;
507  u_str dest;
508  u_str buffer;
509  cu_str text = dict->text;
510  int done = 0;
511  long left = size;
512 
513  dest = buffer = (u_str)GB_give_other_buffer(s_source, size+2);
514 
515  while (left && !done) {
516  int c;
517 
518  if ((c=*source++)&128) { // compressed data
519  int indexLen = GETVAL(c, INDEX_LEN);
520  unsigned long idx = GETVAL(c, INDEX);
521 
522  c = GETVAL(c, LEN); // ==wordLen
523  if (c) c += SHORTLEN_DECR;
524  else c = *source++ + LONGLEN_DECR;
525 
526  gb_assert(indexLen>=0 && indexLen<=1);
527 
528  if (indexLen==0) {
529  idx = (idx << 8) | *source++;
530  }
531  else {
532  idx = (((idx << 8) | source[1]) << 8) | source[0];
533  source += 2;
534  }
535 
536  gb_assert(idx<(GB_ULONG)dict->words);
537 
538  {
539  cu_str word = text+INDEX_DICT_OFFSET(idx, dict);
540 
541 #if DUMP_COMPRESSION_TEST>=2
542  printf(" word='%s' (idx=%lu, off=%li, len=%i)\n",
543  lstr(word, c), idx, (long)ntohl(dict->offsets[idx]), c);
544 #endif
545 
546  {
547  u_str d = dest;
548  gb_assert(((d + c) <= word) || (d >= (word + c)));
549  while (c--) *d++ = *word++; // LOOP_VECTORIZED
550  dest = d;
551  }
552  }
553  }
554  else { // uncompressed bytes
555  if (c & LAST_COMPRESSED_BIT) {
556  done = 1;
557  c ^= LAST_COMPRESSED_BIT;
558  }
559 
560  left -= c;
561  {
562  u_str d = dest;
563  gb_assert(((d + c) <= source) || (d >= (source + c)));
564  while (c--) *d++ = *source++; // LOOP_VECTORIZED
565  dest=d;
566  }
567  }
568  }
569 
570  if (append_zero) *dest++ = 0;
571 
572  *new_size = dest-buffer;
573  gb_assert(size >= *new_size); // buffer overflow
574 
575  return (char *)buffer;
576 }
577 
578 char *gb_uncompress_by_dictionary(GBDATA *gbd, GB_CSTR s_source, size_t size, size_t *new_size) {
579  GB_DICTIONARY *dict = gb_get_dictionary(GB_MAIN(gbd), GB_KEY_QUARK(gbd));
580  bool append_zero = gbd->is_a_string();
581 
582  if (!dict) {
583  GB_ERROR error = GBS_global_string("Cannot decompress db-entry '%s' (no dictionary found)\n", GB_get_db_path(gbd));
584  GB_export_error(error);
585  return NULp;
586  }
587 
588  return gb_uncompress_by_dictionary_internal(dict, s_source, size, append_zero, new_size);
589 }
590 
591 char *gb_compress_by_dictionary(GB_DICTIONARY *dict, GB_CSTR s_source, size_t size, size_t *msize, int last_flag, int search_backward, int search_forward) {
592  cu_str source = (cu_str)s_source;
593  u_str dest;
594  u_str buffer;
595  cu_str unknown = source; // start of uncompressible bytes
596  u_str lastUncompressed = NULp; // ptr to start of last block of uncompressible bytes (in dest)
597 
598 #if defined(ASSERTION_USED)
599  const size_t org_size = size;
600 #endif // ASSERTION_USED
601 
602  gb_assert(size>0); // compression of zero-length data fails!
603 
604  dest = buffer = (u_str)GB_give_other_buffer((GB_CSTR)source, 1+(size/63+1)+size);
605  *dest++ = GB_COMPRESSION_DICTIONARY | last_flag;
606 
607  while (size) {
608  unsigned long wordIndex;
609  int wordLen;
610  int wordFound;
611 
612  if ((wordFound = searchWord(dict, source, size, &wordIndex, &wordLen))) {
613  int length;
614 
615  takeRest :
616  length = source-unknown;
617 
618  if (length) {
619  int shift;
620  int takeShift = 0;
621  int maxShift = (int)min(search_forward, wordLen-1);
622 
623  for (shift=1; shift<=maxShift; shift++) {
624  unsigned long wordIndex2;
625  int wordLen2;
626  int wordFound2;
627 
628  if ((wordFound2 = searchWord(dict, source+shift, size-shift, &wordIndex2, &wordLen2))) {
629  if (wordLen2>(wordLen+shift)) {
630  wordIndex = wordIndex2;
631  wordLen = wordLen2;
632  takeShift = shift;
633  }
634  }
635  }
636 
637  if (takeShift) {
638  source += takeShift;
639  size -= takeShift;
640  length = source-unknown;
641  }
642  }
643 
644  while (length) { // if there were uncompressible bytes
645  int take = (int)min(length, 63);
646 
647 #ifdef COUNT_CHUNKS
648  uncompressedBlocks[take]++;
649 #endif
650 
651  lastUncompressed = dest;
652 
653  *dest++ = take; // tag byte
654  memcpy(dest, unknown, take);
655  dest += take;
656  unknown += take;
657  length -= take;
658  }
659 
660  gb_assert(unknown==source);
661 
662  while (wordFound) { // as long as we find words in dictionary
663  int indexLen = wordIndex>MAX_SHORT_INDEX;
664  int indexHighBits = indexLen==0 ? wordIndex>>8 : wordIndex>>16;
665  int nextWordFound;
666  int nextWordLen;
667  unsigned long nextWordIndex;
668 
669  gb_assert((long)wordIndex<dict->words);
670  gb_assert((long)wordIndex <= MAX_LONG_INDEX);
671  gb_assert(indexHighBits==(indexHighBits & BITMASK(INDEX_BITS)));
672  gb_assert(wordLen>=MIN_SHORTLEN);
673 
674  lastUncompressed = NULp;
675 
676  {
677  cu_str source2 = source+wordLen;
678  long size2 = size-wordLen;
679 
680  if (!(nextWordFound=searchWord(dict, source+wordLen, size-wordLen, &nextWordIndex, &nextWordLen))) { // no word right afterwards
681  int shift;
682 
683  for (shift=1; shift<=search_backward && shift<(wordLen-MIN_COMPR_WORD_LEN); shift++) {
684  // try to cut end of word to get a better result
685  unsigned long wordIndex2;
686  int wordLen2;
687  int wordFound2;
688 
689  if ((wordFound2=searchWord(dict, source2-shift, size2+shift, &wordIndex2, &wordLen2))) {
690  if (wordLen2>(shift+1)) {
691  wordLen -= shift;
692 
693  nextWordFound = 1;
694  nextWordIndex = wordIndex2;
695  nextWordLen = wordLen2;
696  break;
697  }
698  }
699  }
700  }
701  }
702 
703 #ifdef COUNT_CHUNKS
704  compressedBlocks[wordLen]++;
705 #endif
706 
707 #if DUMP_COMPRESSION_TEST>=2
708  printf(" word='%s' (idx=%li, off=%i, len=%i)\n",
709  dict_word(dict, wordIndex, wordLen), wordIndex, (int)ntohl(dict->offsets[wordIndex]), wordLen);
710 #endif
711 
712  if (wordLen<=MAX_SHORTLEN) {
713  *dest++ = 128 |
714  (indexLen << INDEX_LEN_SHIFT) |
715  (indexHighBits << INDEX_SHIFT) |
716  ((wordLen-SHORTLEN_DECR) << LEN_SHIFT);
717  }
718  else {
719  *dest++ = 128 |
720  (indexLen << INDEX_LEN_SHIFT) |
721  (indexHighBits << INDEX_SHIFT);
722  *dest++ = wordLen-LONGLEN_DECR; // extra length byte
723  }
724 
725  *dest++ = (char)wordIndex; // low index byte
726  if (indexLen)
727  *dest++ = (char)(wordIndex >> 8); // high index byte
728 
729  unknown = source += wordLen;
730  size -= wordLen;
731 
732  wordFound = nextWordFound;
733  wordIndex = nextWordIndex;
734  wordLen = nextWordLen;
735  }
736  }
737  else {
738  source++;
739  if (--size==0) goto takeRest;
740  }
741  }
742 
743  if (lastUncompressed) *lastUncompressed |= LAST_COMPRESSED_BIT;
744  else *dest++ = LAST_COMPRESSED_BIT;
745 
746  *msize = dest-buffer;
747 
748 #if defined(ASSERTION_USED)
749  {
750  size_t new_size = -1;
751  char *test = gb_uncompress_by_dictionary_internal(dict, (GB_CSTR)buffer+1, org_size + GB_COMPRESSION_TAGS_SIZE_MAX, true, &new_size);
752 
753  gb_assert(memcmp(test, s_source, org_size) == 0);
754  gb_assert((org_size+1) == new_size);
755  }
756 #endif // ASSERTION_USED
757 
758  return (char*)buffer;
759 }
760 
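// Illustrative sketch, not part of the original file: a minimal compress/uncompress round trip
// for one entry, mirroring the ASSERTION_USED check above and test_dictionary() below.
// 'dict' is assumed to be the dictionary belonging to the entry's key.
static void dict_roundtrip_demo(GB_DICTIONARY *dict, GBDATA *gbd) {
    if (!COMPRESSIBLE(gbd->type())) return;                 // only string/bytes/ints/floats entries

    size_t size;
    cu_str data = get_data_n_size(gbd, &size);
    if (gbd->is_a_string()) size--;                         // terminating zero is not compressed
    if (size < 1) return;

    u_str copy = (u_str)gbm_get_mem(size, GBM_DICT_INDEX);  // keep a copy: GB_give_other_buffer()
    memcpy(copy, data, size);                               // may recycle the buffer 'data' lives in

    size_t compressedSize;
    char  *compressed = gb_compress_by_dictionary(dict, (GB_CSTR)data, size, &compressedSize,
                                                  0 /*last_flag*/, 9999 /*search_backward*/, 2 /*search_forward*/);

    size_t restoredSize;
    char  *restored = gb_uncompress_by_dictionary(gbd, compressed+1, size+GB_COMPRESSION_TAGS_SIZE_MAX, &restoredSize);

    gb_assert(restored && GB_MEMCMP(copy, restored, size) == 0);   // round trip reproduces the data
    gbm_free_mem(copy, size, GBM_DICT_INDEX);
}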
761 
762 #if defined(TEST_DICT)
763 
764 static void test_dictionary(GB_DICTIONARY *dict, O_gbdByKey *gbk, long *uncompSum, long *compSum) {
765  long uncompressed_sum = 0;
766  long compressed_sum = 0;
767 
768  long dict_size = (dict->words*2+1)*sizeof(GB_NINT)+dict->textlen;
769 
770  long char_count[256];
771  for (int i=0; i<256; i++) char_count[i] = 0;
772 
773  printf(" * Testing compression..\n");
774 
775 #ifdef COUNT_CHUNKS
776  clearChunkCounters();
777 #endif
778 
779  for (int cnt=0; cnt<gbk->cnt; cnt++) {
780  GBDATA *gbd = gbk->gbds[cnt];
781 
782  if (COMPRESSIBLE(gbd->type())) {
783  size_t size;
784  cu_str data = get_data_n_size(gbd, &size);
785 
786  if (gbd->is_a_string()) size--;
787  if (size<1) continue;
788 
789  u_str copy = (u_str)gbm_get_mem(size, GBM_DICT_INDEX);
790  gb_assert(copy!=0);
791  memcpy(copy, data, size);
792 
793 #if DUMP_COMPRESSION_TEST>=1
794  printf("----------------------------\n");
795  printf("original : %3li b = '%s'\n", size, data);
796 #endif
797 
798  int last_flag = 0;
799  size_t compressedSize;
800  u_str compressed = (u_str)gb_compress_by_dictionary(dict, (GB_CSTR)data, size, &compressedSize, last_flag, 9999, 2);
801 
802 #if DUMP_COMPRESSION_TEST>=1
803  printf("compressed : %3li b = '%s'\n", compressedSize, lstr(compressed, compressedSize));
804  dumpBinary(compressed, compressedSize);
805 #endif
806 
807  for (size_t i=0; i<compressedSize; i++) char_count[compressed[i]]++;
808 
809  size_t new_size = -1;
810  u_str uncompressed = (u_str)gb_uncompress_by_dictionary(gbd, (char*)compressed+1, size+GB_COMPRESSION_TAGS_SIZE_MAX, &new_size);
811 
812 #if DUMP_COMPRESSION_TEST>=1
813  printf("copy : %3li b = '%s'\n", size, lstr(copy, size));
814  printf("decompressed: %3li b = '%s'\n", size, lstr(uncompressed, size));
815 #endif
816 
817  if (GB_MEMCMP(copy, uncompressed, size)!=0) {
818  int byte = 0;
819 
820  while (copy[byte]==uncompressed[byte]) byte++;
821  printf("Error in compression (off=%i, '%s'", byte, lstr(copy+byte, 10));
822  printf("!='%s'\n", lstr(uncompressed+byte, 10));
823  }
824 
825  if (compressedSize<size) {
826  uncompressed_sum += size;
827  compressed_sum += compressedSize;
828  }
829  else {
830  uncompressed_sum += size;
831  compressed_sum += size;
832  }
833 
834  gbm_free_mem(copy, size, GBM_DICT_INDEX);
835  }
836  }
837 
838 #ifdef COUNT_CHUNKS
839  dumpChunkCounters();
840 #endif
841 
842  {
843  long compressed_plus_dict = compressed_sum+dict_size;
844  char *dict_text = GBS_global_string_copy("+dict %li b", dict_size);
845  long ratio = (compressed_plus_dict*100)/uncompressed_sum;
846 
847  printf(" uncompressed size = %10li b\n"
848  " compressed size = %10li b\n"
849  " %17s = %10li b (Ratio=%li%%)\n",
850  uncompressed_sum,
851  compressed_sum,
852  dict_text, compressed_plus_dict, ratio);
853 
854  free(dict_text);
855  }
856 
857  *uncompSum += uncompressed_sum;
858  *compSum += compressed_sum+dict_size;
859 }
860 
861 #endif // TEST_DICT
862 
863 
864 // ******************* Build dictionary ******************
865 
866 #ifdef DEBUG
867 #define TEST // test trees?
868 // #define DUMP_TREE // dump trees?
869 
870 // #define DUMP_EXPAND
871 /*
872  #define SELECT_WORDS
873  #define SELECTED_WORDS "oropl"
874 */
875 
876 # ifdef SELECT_WORDS
877 static char *strnstr(char *s1, int len, char *s2) {
878  char c = *s2;
879  int len2 = strlen(s2);
880 
881  while (len-->=len2) {
882  if (*s1==c) {
883  if (strncmp(s1, s2, len2)==0) return s1;
884  }
885  s1++;
886  }
887 
888  return NULp;
889 }
890 # endif
891 
892 #ifdef DUMP_TREE
893 static void dump_dtree(int deep, DictTree tree) {
894  static unsigned_char buffer[1024];
895 
896  if (tree.full) {
897  switch (tree.full->typ) {
898  case FULL_NODE: {
899  int idx;
900 
901  for (idx=0; idx<256; idx++) {
902  buffer[deep] = idx;
903  buffer[deep+1] = 0;
904 
905  if (tree.full->son[idx].exists) dump_dtree(deep+1, tree.full->son[idx]);
906  else if (tree.full->count[idx]>0) printf(" '%s' (%i) [array]\n", buffer, tree.full->count[idx]);
907  }
908  break;
909  }
910  case SINGLE_NODE: {
911  buffer[deep] = tree.single->ch;
912  buffer[deep+1] = 0;
913 
914  if (tree.single->son.exists) dump_dtree(deep+1, tree.single->son);
915  else printf(" '%s' (%i) [single]\n", buffer, tree.single->count);
916 
917  if (tree.single->brother.exists) dump_dtree(deep, tree.single->brother);
918  break;
919  }
920  }
921  }
922 }
923 #endif
924 
925 #else
926 #ifdef DUMP_TREE
927 # define dump_dtree(deep, tree)
928 #endif
929 #endif
930 
931 #ifdef TEST
932 static int testCounts(DictTree tree) {
933  // tests if all inner nodes have correct 'count's
934  int cnt = 0;
935 
936  if (tree.exists) {
937  switch (tree.full->typ) {
938  case SINGLE_NODE: {
939  while (tree.exists) {
940  if (tree.single->son.exists) {
941  int son_cnt = testCounts(tree.single->son);
942 #ifdef COUNT_EQUAL
943  gb_assert(son_cnt==tree.single->count);
944 #else
945  gb_assert(son_cnt<=tree.single->count);
946 #endif
947  }
948 
949  gb_assert(tree.single->count>0);
950  cnt += tree.single->count;
951  tree = tree.single->brother;
952  }
953  break;
954  }
955  case FULL_NODE: {
956  int idx,
957  sons = 0;
958 
959  for (idx=0; idx<256; idx++) {
960  if (tree.full->son[idx].exists) {
961  int son_cnt = testCounts(tree.full->son[idx]);
962 #ifdef COUNT_EQUAL
963  gb_assert(son_cnt==tree.full->count[idx]);
964 #else
965  gb_assert(son_cnt<=tree.full->count[idx]);
966 #endif
967  if (tree.full->usedSons) gb_assert(tree.full->count[idx]>0);
968  else gb_assert(tree.full->count[idx]==0);
969 
970  sons++;
971  }
972  else if (tree.full->count[idx]) {
973  sons++;
974  }
975 
976  cnt += tree.full->count[idx];
977  }
978 
979  gb_assert(sons==tree.full->usedSons);
980  break;
981  }
982  }
983  }
984 
985  return cnt;
986 }
987 
988 // #define TEST_MAX_OCCUR_COUNT
989 
990 #ifdef TEST_MAX_OCCUR_COUNT
991 #define MAX_OCCUR_COUNT 600000
992 #endif
993 
994 static DictTree test_dtree(DictTree tree) {
995  // only correct while tree is under construction (build_dict_tree())
996 
997  if (tree.exists) {
998  switch (tree.full->typ) {
999  case SINGLE_NODE: {
1000 #if defined(TEST_MAX_OCCUR_COUNT)
1001  gb_assert(tree.single->count<MAX_OCCUR_COUNT); // quite improbable
1002 #endif // TEST_MAX_OCCUR_COUNT
1003 
1004  if (tree.single->son.exists) {
1005  gb_assert(tree.single->count==0);
1006  test_dtree(tree.single->son);
1007  }
1008  else {
1009  gb_assert(tree.single->count>0);
1010  }
1011 
1012  if (tree.single->brother.exists) test_dtree(tree.single->brother);
1013  break;
1014  }
1015  case FULL_NODE: {
1016  int idx;
1017  int countSons = 0;
1018 
1019  for (idx=0; idx<256; idx++) {
1020 #if defined(TEST_MAX_OCCUR_COUNT)
1021  gb_assert(tree.full->count[idx]<MAX_OCCUR_COUNT); // quite improbable
1022 #endif // TEST_MAX_OCCUR_COUNT
1023 
1024  if (tree.full->son[idx].exists) {
1025  gb_assert(tree.full->count[idx]==0);
1026  test_dtree(tree.full->son[idx]);
1027  countSons++;
1028  }
1029  else {
1030  gb_assert(tree.full->count[idx]>=0);
1031  if (tree.full->count[idx]>0)
1032  countSons++;
1033  }
1034  }
1035 
1036  gb_assert(countSons==tree.full->usedSons);
1037 
1038  break;
1039  }
1040  }
1041  }
1042 
1043  return tree;
1044 }
1045 
1046 #else
1047 # define test_dtree(tree) // (tree)
1048 # define testCounts(tree) // 0
1049 #endif
1050 
1051 
1052 static DictTree new_dtree(cu_str text, long len, long *memcount) {
1053  // creates a new (sub-)tree from 'text' (which has length 'len')
1054  DictTree tree;
1055 
1056  if (len) {
1057  SingleDictTree *tail = NULp;
1058  SingleDictTree *head = NULp;
1059 
1060  while (len) {
1061  if (tail) tail = tail->son.single = (SingleDictTree*)gbm_get_mem(sizeof(*tail), GBM_DICT_INDEX);
1062  else tail = head = (SingleDictTree*)gbm_get_mem(sizeof(*tail), GBM_DICT_INDEX);
1063 
1064  (*memcount) += sizeof(*tail);
1065 
1066  tail->typ = SINGLE_NODE;
1067  tail->ch = *text++;
1068  len--;
1069 
1070  tail->brother.single = NULp;
1071  tail->son.single = NULp;
1072  }
1073 
1074  tail->count = 1;
1075  tree.single = head;
1076  }
1077  else {
1078  tree.single = NULp;
1079  }
1080 
1081  return tree;
1082 }
1083 
1084 static DictTree single2full_dtree(DictTree tree, long *memcount) {
1085  if (tree.exists && tree.single->typ==SINGLE_NODE) {
1086  FullDictTree *full = (FullDictTree*)gbm_get_mem(sizeof(*full), GBM_DICT_INDEX);
1087  int idx;
1088 
1089  (*memcount) += sizeof(*full);
1090  full->typ = FULL_NODE;
1091  full->usedSons = 0;
1092 
1093  for (idx=0; idx<256; idx++) { // LOOP_VECTORIZED =1|3 // this depends on whether this function gets inlined (differs depending on DEBUG/UNIT_TEST "both off" vs. "at least one active")
1094  full->son[idx].exists = NULp;
1095  full->count[idx] = 0;
1096  }
1097 
1098  while (tree.exists) {
1099  SingleDictTree *t = tree.single;
1100 
1101  gb_assert(t->typ==SINGLE_NODE);
1102  gb_assert(!full->son[t->ch].exists);
1103 
1104  full->son[t->ch] = t->son;
1105  full->count[t->ch] = t->count;
1106  full->usedSons++;
1107 
1108  tree.single = t->brother.single;
1109 
1110  gbm_free_mem(t, sizeof(*t), GBM_DICT_INDEX);
1111  (*memcount) -= sizeof(*t);
1112  }
1113 
1114  tree.full = full;
1115  }
1116 
1117  return tree;
1118 }
1119 
1120 static void free_dtree(DictTree tree) {
1121  if (tree.exists) {
1122  switch (tree.full->typ) {
1123  case SINGLE_NODE: {
1124  if (tree.single->son.exists) free_dtree(tree.single->son);
1125  if (tree.single->brother.exists) free_dtree(tree.single->brother);
1126 
1127  gbm_free_mem(tree.single, sizeof(*(tree.single)), GBM_DICT_INDEX);
1128  break;
1129  }
1130  case FULL_NODE: {
1131  int idx;
1132 
1133  for (idx=0; idx<256; idx++) if (tree.full->son[idx].exists) free_dtree(tree.full->son[idx]);
1134  gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1135  break;
1136  }
1137  }
1138  }
1139 }
1140 
1141 
1142 
1143 static DictTree cut_dtree(DictTree tree, int cut_count, long *memcount, long *leafcount) {
1144  /* removes all branches from 'tree' which are referenced at most cut_count times
1145  * returns: the reduced tree
1146  */
1147  if (tree.exists) {
1148  switch (tree.full->typ) {
1149  case SINGLE_NODE: {
1150  if (tree.single->son.exists) tree.single->son = cut_dtree(tree.single->son, cut_count, memcount, leafcount);
1151 
1152  if (!tree.single->son.exists) { // leaf
1153  if (tree.single->count<=cut_count) { // leaf with less/equal references
1154  DictTree brother = tree.single->brother;
1155 
1156  gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1157  (*memcount) -= sizeof(*tree.single);
1158  if (brother.exists) return cut_dtree(brother, cut_count, memcount, leafcount);
1159 
1160  tree.single = NULp;
1161  break;
1162  }
1163  else {
1164  (*leafcount)++;
1165  }
1166  }
1167 
1168  if (tree.single->brother.exists) tree.single->brother = cut_dtree(tree.single->brother, cut_count, memcount, leafcount);
1169  break;
1170  }
1171  case FULL_NODE: {
1172  int idx;
1173  int count = 0;
1174 
1175  for (idx=0; idx<256; idx++) {
1176  if (tree.full->son[idx].exists) {
1177  tree.full->son[idx] = cut_dtree(tree.full->son[idx], cut_count, memcount, leafcount);
1178 
1179  if (tree.full->son[idx].exists) count++;
1180  else tree.full->count[idx] = 0;
1181  }
1182  else if (tree.full->count[idx]>0) {
1183  if (tree.full->count[idx]<=cut_count) {
1184  tree.full->count[idx] = 0;
1185  }
1186  else {
1187  count++;
1188  (*leafcount)++;
1189  }
1190  }
1191  }
1192 
1193  tree.full->usedSons = count;
1194 
1195  if (!count) { // no more sons
1196  gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1197  (*memcount) -= sizeof(*(tree.full));
1198  tree.exists = NULp;
1199  }
1200 
1201  break;
1202  }
1203  }
1204  }
1205 
1206  return tree;
1207 }
1208 static DictTree cut_useless_words(DictTree tree, int deep, long *removed) {
1209  /* removes/shortens all branches of 'tree' which are not useful for compression
1210  * 'deep' should be zero (incremented by cut_useless_words)
1211  * 'removed' will be set to the # of removed occurrences
1212  * returns: the reduced tree
1213  */
1214  *removed = 0;
1215 
1216  if (tree.exists) {
1217  deep++;
1218 
1219  switch (tree.full->typ) {
1220  long removed_single;
1221 
1222  case SINGLE_NODE: {
1223  if (tree.single->son.exists) {
1224  tree.single->son = cut_useless_words(tree.single->son, deep, &removed_single);
1225  tree.single->count -= removed_single;
1226  *removed += removed_single;
1227  }
1228 
1229  if (!tree.single->son.exists && !WORD_HELPFUL(deep, tree.single->count)) {
1230  DictTree brother = tree.single->brother;
1231 
1232  *removed += tree.single->count;
1233  gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1234 
1235  if (brother.exists) {
1236  tree = cut_useless_words(brother, deep-1, &removed_single);
1237  *removed += removed_single;
1238  }
1239  else {
1240  tree.exists = NULp;
1241  }
1242 
1243  break;
1244  }
1245 
1246  if (tree.single->brother.exists) {
1247  tree.single->brother = cut_useless_words(tree.single->brother, deep-1, &removed_single);
1248  *removed += removed_single;
1249  }
1250 
1251  break;
1252  }
1253  case FULL_NODE: {
1254  int idx;
1255  int count = 0;
1256 
1257  for (idx=0; idx<256; idx++) {
1258  if (tree.full->son[idx].exists) {
1259  tree.full->son[idx] = cut_useless_words(tree.full->son[idx], deep, &removed_single);
1260  tree.full->count[idx] -= removed_single;
1261  *removed += removed_single;
1262  }
1263 
1264  if (tree.full->son[idx].exists) {
1265  count++;
1266  }
1267  else if (tree.full->count[idx]) {
1268  if (!WORD_HELPFUL(deep, tree.full->count[idx])) { // useless!
1269  *removed += tree.full->count[idx];
1270  tree.full->count[idx] = 0;
1271  }
1272  else {
1273  count++;
1274  }
1275  }
1276  }
1277 
1278  tree.full->usedSons = count;
1279 
1280  if (!count) { // no more sons
1281  gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1282  tree.exists = NULp;
1283  }
1284 
1285  break;
1286  }
1287  }
1288  }
1289 
1290  return tree;
1291 }
1292 
1293 static DictTree add_dtree_to_dtree(DictTree toAdd, DictTree to, long *memcount) {
1294  /* adds 'toAdd' as brother of 'to' (must be leftmost of all SINGLE_NODEs or a FULL_NODE)
1295  * returns: the leftmost of all SINGLE_NODEs or a FULL_NODE
1296  */
1297  DictTree tree = toAdd;
1298 
1299  gb_assert(toAdd.single->typ==SINGLE_NODE);
1300 
1301  if (to.exists) {
1302  switch (to.full->typ) {
1303  case SINGLE_NODE: {
1304  SingleDictTree *left = to.single;
1305 
1306  gb_assert(left);
1307 
1308  if (toAdd.single->ch < to.single->ch) {
1309  toAdd.single->brother = to;
1310  return toAdd;
1311  }
1312 
1313  while (to.single->brother.exists) {
1314  if (toAdd.single->ch < to.single->brother.single->ch) {
1315  toAdd.single->brother = to.single->brother;
1316  to.single->brother = toAdd;
1317 
1318  tree.single = left;
1319  return tree;
1320  }
1321  to = to.single->brother;
1322  }
1323 
1324  to.single->brother = toAdd;
1325  tree.single = left;
1326  break;
1327  }
1328  case FULL_NODE: {
1329  unsigned_char ch = toAdd.single->ch;
1330 
1331  gb_assert(!to.full->son[ch].exists);
1332  gb_assert(to.full->count[ch]==0); // if this fails, count must be added & tested
1333  gb_assert(!toAdd.single->brother.exists);
1334 
1335  to.full->son[ch] = toAdd.single->son;
1336  to.full->count[ch] = toAdd.single->count;
1337  to.full->usedSons++;
1338 
1339  tree = to;
1340 
1341  gbm_free_mem(toAdd.single, sizeof(*(toAdd.single)), GBM_DICT_INDEX);
1342  (*memcount) -= sizeof(toAdd.single);
1343 
1344  break;
1345  }
1346  }
1347  }
1348 
1349  return tree;
1350 }
1351 
1352 static DictTree add_to_dtree(DictTree tree, cu_str text, long len, long *memcount) {
1353  /* adds the string 'text' (which has length 'len') to 'tree'
1354  * returns: new tree
1355  */
1356  if (tree.exists) {
1357  switch (tree.full->typ) {
1358  case SINGLE_NODE: {
1359  SingleDictTree *t = tree.single;
1360  int count = 0;
1361 
1362  do {
1363  count++;
1364  if (t->ch==text[0]) { // we found an existing subtree
1365  if (len>1) {
1366  t->son = add_to_dtree(t->son, text+1, len-1, memcount); // add rest of text to subtree
1367  }
1368  else {
1369  gb_assert(len==1);
1370  gb_assert(!t->son.exists);
1371  t->count++;
1372  }
1373 
1374  return count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree;
1375  }
1376  else if (t->ch > text[0]) {
1377  break;
1378  }
1379  }
1380  while ((t=t->brother.single));
1381 
1382  tree = add_dtree_to_dtree(new_dtree(text, len, memcount), // otherwise we create a new subtree
1383  count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree,
1384  memcount);
1385  break;
1386  }
1387  case FULL_NODE: {
1388  unsigned_char ch = text[0];
1389 
1390  if (tree.full->son[ch].exists) {
1391  tree.full->son[ch] = add_to_dtree(tree.full->son[ch], text+1, len-1, memcount);
1392  }
1393  else {
1394  tree.full->son[ch] = new_dtree(text+1, len-1, memcount);
1395  if (!tree.full->son[ch].exists) {
1396  if (tree.full->count[ch]==0) tree.full->usedSons++;
1397  tree.full->count[ch]++;
1398  }
1399  else {
1400  tree.full->usedSons++;
1401  }
1402  }
1403  break;
1404  }
1405  }
1406 
1407  return tree;
1408  }
1409 
1410  return new_dtree(text, len, memcount);
1411 }
1412 
1413 static long calcCounts(DictTree tree) {
1414  long cnt = 0;
1415 
1416  gb_assert(tree.exists);
1417 
1418  switch (tree.full->typ) {
1419  case SINGLE_NODE: {
1420  while (tree.exists) {
1421  if (tree.single->son.exists) tree.single->count = calcCounts(tree.single->son);
1422  gb_assert(tree.single->count>0);
1423  cnt += tree.single->count;
1424  tree = tree.single->brother;
1425  }
1426  break;
1427  }
1428  case FULL_NODE: {
1429  int idx;
1430 
1431  for (idx=0; idx<256; idx++) {
1432  if (tree.full->son[idx].exists) {
1433  tree.full->count[idx] = calcCounts(tree.full->son[idx]);
1434  gb_assert(tree.full->count[idx]>0);
1435  }
1436  else {
1437  gb_assert(tree.full->count[idx]>=0);
1438  }
1439  cnt += tree.full->count[idx];
1440  }
1441  break;
1442  }
1443  }
1444 
1445  return cnt;
1446 }
1447 
1448 static int count_dtree_leafs(DictTree tree, int deep, int *maxdeep) {
1449  // returns # of leafs and max. depth of tree
1450  int leafs = 0;
1451 
1452  gb_assert(tree.exists);
1453 
1454  if (++deep>*maxdeep) *maxdeep = deep;
1455 
1456  switch (tree.full->typ) {
1457  case SINGLE_NODE: {
1458  if (tree.single->son.exists) leafs += count_dtree_leafs(tree.single->son, deep, maxdeep);
1459  else leafs++;
1460  if (tree.single->brother.exists) leafs += count_dtree_leafs(tree.single->brother, deep, maxdeep);
1461  break;
1462  }
1463  case FULL_NODE: {
1464  int idx;
1465 
1466  for (idx=0; idx<256; idx++) {
1467  if (tree.full->son[idx].exists) leafs += count_dtree_leafs(tree.full->son[idx], deep, maxdeep);
1468  else if (tree.full->count[idx]) leafs++;
1469  }
1470  break;
1471  }
1472  }
1473 
1474  return leafs;
1475 }
1476 
1477 static int COUNT(DictTree tree) {
1478  // counts sum of # of occurrences of tree
1479  int cnt = 0;
1480 
1481  switch (tree.single->typ) {
1482  case SINGLE_NODE: {
1483  while (tree.exists) {
1484  cnt += tree.single->count;
1485  tree = tree.single->brother;
1486  }
1487  break;
1488  }
1489  case FULL_NODE: {
1490  int idx;
1491 
1492  for (idx=0; idx<256; idx++) cnt += tree.full->count[idx]; // LOOP_VECTORIZED =4[<10] =18 // 4x with 4.9.2; 4x with 5.x; 18x with 10.1+10.2+10.3; (Note: has been 1x for <=4.9.3 before; was found wrong for 4.9.2)
1493  break;
1494  }
1495  }
1496 
1497  return cnt;
1498 }
1499 
1500 static DictTree removeSubsequentString(DictTree *tree_pntr, cu_str buffer, int len, int max_occur) {
1501  /* searches tree for 'buffer' (length='len')
1502  *
1503  * returns - rest below found string
1504  * (if found and if the # of occurrences of the string is at most 'max_occur')
1505  * - NULp otherwise
1506  *
1507  * removes the whole found string from the tree (not only the rest!)
1508  */
1509  DictTree tree = *tree_pntr, rest;
1510  static int restCount;
1511 
1512  rest.exists = NULp;
1513 
1514  gb_assert(tree.exists);
1515  gb_assert(len>0);
1516 
1517  switch (tree.full->typ) {
1518  case SINGLE_NODE: {
1519  while (tree.single->ch <= buffer[0]) {
1520  if (tree.single->ch == buffer[0]) { // found wanted character
1521  if (tree.single->son.exists) {
1522  if (len==1) {
1523  if (tree.single->count <= max_occur) {
1524  rest = tree.single->son;
1525  restCount = COUNT(rest);
1526  tree.single->son.exists = NULp;
1527  }
1528  }
1529  else {
1530  rest = removeSubsequentString(&tree.single->son, buffer+1, len-1, max_occur);
1531  }
1532  }
1533 
1534  if (rest.exists) { // the string was found
1535  tree.single->count -= restCount;
1536  gb_assert(tree.single->count >= 0);
1537 
1538  if (!tree.single->count) { // empty subtree -> delete myself
1539  DictTree brother = tree.single->brother;
1540 
1541  tree.single->brother.exists = NULp; // elsewise it would be freed by free_dtree
1542  free_dtree(tree);
1543  *tree_pntr = tree = brother;
1544  }
1545  }
1546 
1547  break;
1548  }
1549 
1550  tree_pntr = &(tree.single->brother);
1551  if (!(tree = tree.single->brother).exists) break;
1552  }
1553 
1554  break;
1555  }
1556  case FULL_NODE: {
1557  unsigned_char ch;
1558 
1559  if (tree.full->son[ch=buffer[0]].exists) {
1560  if (len==1) {
1561  if (tree.full->count[ch] <= max_occur) {
1562  rest = tree.full->son[ch];
1563  restCount = COUNT(rest);
1564  tree.full->son[ch].exists = NULp;
1565  }
1566  }
1567  else {
1568  rest = removeSubsequentString(&tree.full->son[ch], buffer+1, len-1, max_occur);
1569  }
1570 
1571  if (rest.exists) {
1572  gb_assert(restCount>0);
1573  tree.full->count[ch] -= restCount;
1574  gb_assert(tree.full->count[ch]>=0);
1575  if (tree.full->count[ch]==0) {
1576  gb_assert(!tree.full->son[ch].exists);
1577 
1578  if (--tree.full->usedSons==0) { // last son deleted -> delete myself
1579  free_dtree(tree);
1580  tree.exists = NULp;
1581  *tree_pntr = tree;
1582  }
1583  }
1584  }
1585  }
1586 
1587  break;
1588  }
1589  }
1590 
1591  return rest;
1592 }
1593 
1594 static cu_str memstr(cu_str stringStart, int stringStartLen, cu_str inString, int inStringLen) {
1595  if (!inStringLen) return stringStart; // string of length zero is found everywhere
1596 
1597  while (stringStartLen) {
1598  cu_str found = (cu_str)memchr(stringStart, inString[0], stringStartLen);
1599 
1600  if (!found) break;
1601 
1602  stringStartLen -= found-stringStart;
1603  stringStart = found;
1604 
1605  if (stringStartLen<inStringLen) break;
1606 
1607  if (GB_MEMCMP(stringStart, inString, inStringLen)==0) return stringStart;
1608 
1609  stringStart++;
1610  stringStartLen--;
1611  }
1612 
1613  return NULp;
1614 }
1615 
1616 
1617 static int expandBranches(u_str buffer, int deep, int minwordlen, int maxdeep, DictTree tree, DictTree root, int max_percent) {
1618  /* expands all branches in 'tree'
1619  *
1620  * this is done by searching each of these branches in 'root' and moving any subsequent parts from there to 'tree'
1621  * (this is only done, if the # of occurrences of the found part does not exceed the # of occurrences of 'tree' more than 'max_percent' percent)
1622  *
1623  * 'buffer' strings are rebuilt here while descending the tree (length of buffer==MAX_WORD_LEN)
1624  * 'deep' recursion level
1625  * 'maxdeep' maximum recursion level
1626  * 'minwordlen' is the length of the words to search (usually equal to MIN_WORD_LEN-1)
1627  *
1628  * returns the # of occurrences which were added to 'tree'
1629  */
1630  int expand = 0; // calculate count-sum of added subsequent parts
1631 
1632  gb_assert(tree.exists);
1633 
1634  if (deep<maxdeep) {
1635  switch (tree.full->typ) {
1636  case SINGLE_NODE: {
1637  while (tree.exists) {
1638  buffer[deep] = tree.single->ch;
1639 
1640  if (!tree.single->son.exists) {
1641  DictTree rest;
1642  u_str buf = buffer+1;
1643  int len = deep;
1644 
1645  if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1646  buf += len-minwordlen;
1647  len = minwordlen;
1648  }
1649 
1650  if (len==minwordlen) {
1651  cu_str self = memstr(buffer, deep+1, buf, len);
1652 
1653  gb_assert(self);
1654  if (self==buf) rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.single->count)/100);
1655  else rest.exists = NULp;
1656  }
1657  else {
1658  rest.exists = NULp;
1659  }
1660 
1661  if (rest.exists) {
1662  int cnt = COUNT(rest);
1663 
1664  tree.single->son = rest;
1665  tree.single->count += cnt;
1666  expand += cnt;
1667 #ifdef DUMP_EXPAND
1668 #define DUMP_MORE 1
1669  printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1670  printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1671 #endif
1672  }
1673  }
1674 
1675  if (tree.single->son.exists) {
1676  int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.single->son, root, max_percent);
1677 
1678  expand += added;
1679  tree.single->count += added;
1680  }
1681 
1682  tree = tree.single->brother;
1683  }
1684 
1685  break;
1686  }
1687  case FULL_NODE: {
1688  int idx;
1689 
1690  for (idx=0; idx<256; idx++) {
1691  buffer[deep] = idx;
1692 
1693  if (!tree.full->son[idx].exists && tree.full->count[idx]) { // leaf
1694  DictTree rest;
1695  u_str buf = buffer+1;
1696  int len = deep;
1697 
1698  if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1699  buf += len-minwordlen;
1700  len = minwordlen;
1701  }
1702 
1703  if (len==minwordlen) {
1704  cu_str self = memstr(buffer, deep+1, buf, len);
1705 
1706  gb_assert(self);
1707  if (self==buf)
1708  rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.full->count[idx])/100);
1709  else
1710  rest.exists = NULp;
1711  }
1712  else {
1713  rest.exists = NULp;
1714  }
1715 
1716  if (rest.exists) { // substring found!
1717  int cnt = COUNT(rest);
1718 
1719  if (tree.full->count[idx]==0) tree.full->usedSons++;
1720  tree.full->son[idx] = rest;
1721  tree.full->count[idx] += cnt;
1722 
1723  expand += cnt;
1724 #ifdef DUMP_EXPAND
1725  printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1726  printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1727 #endif
1728  }
1729  }
1730 
1731  if (tree.full->son[idx].exists) {
1732  int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.full->son[idx], root, max_percent);
1733 
1734  expand += added;
1735  tree.full->count[idx] += added;
1736  }
1737  }
1738 
1739  break;
1740  }
1741  }
1742  }
1743 
1744  return expand;
1745 }
1746 
1747 static DictTree build_dict_tree(O_gbdByKey *gbk, long maxmem, long maxdeep, size_t minwordlen, long *data_sum) {
1748  /* builds a tree of the most used words
1749  *
1750  * 'maxmem' is the amount of memory that will be allocated
1751  * 'maxdeep' is the maximum length of the _returned_ words
1752  * 'minwordlen' is the minimum length a word needs to get into the tree
1753  * this is used in the first pass as maximum tree depth
1754  * 'data_sum' will be set to the overall-size of data of which the tree was built
1755  */
1756  DictTree tree;
1757  long memcount = 0;
1758  long leafs = 0;
1759 
1760  *data_sum = 0;
1761 
1762  {
1763  int cnt;
1764  long lowmem = (maxmem*9)/10;
1765  int cut_count = 1;
1766 
1767  // Build 8-level-deep tree of all existing words
1768 
1769  tree.exists = NULp; // empty tree
1770 
1771  for (cnt=0; cnt<gbk->cnt; cnt++) {
1772  GBDATA *gbd = gbk->gbds[cnt];
1773 
1774  if (COMPRESSIBLE(gbd->type())) {
1775  size_t size;
1776  cu_str data = get_data_n_size(gbd, &size);
1777  cu_str lastWord;
1778 
1779  if (gbd->is_a_string()) size--;
1780  if (size<minwordlen) continue;
1781 
1782  *data_sum += size;
1783  lastWord = data+size-minwordlen;
1784 
1785 #ifdef SELECT_WORDS
1786  if (strnstr(data, size, SELECTED_WORDS)) // test some words only
1787 #endif
1788  {
1789 
1790  for (; data<=lastWord; data++) {
1791  tree = add_to_dtree(tree, data, minwordlen, &memcount);
1792 
1793  while (memcount>maxmem) {
1794  leafs = 0;
1795  tree = cut_dtree(tree, cut_count, &memcount, &leafs);
1796  if (memcount<=lowmem) break;
1797  cut_count++;
1798  }
1799  }
1800  }
1801  }
1802  }
1803  }
1804 
1805  {
1806  int cutoff = 1;
1807 
1808  leafs = 0;
1809  tree = cut_dtree(tree, cutoff, &memcount, &leafs); // cut all single elements
1810  test_dtree(tree);
1811 
1812 #if defined(DEBUG)
1813  if (tree.exists) {
1814  int maxdeep2 = 0;
1815  long counted = count_dtree_leafs(tree, 0, &maxdeep2);
1816  gb_assert(leafs == counted);
1817  }
1818 #endif // DEBUG
1819 
1820  // avoid directory overflow (max. 18bit)
1821  while (leafs >= MAX_LONG_INDEX) {
1822  leafs = 0;
1823  ++cutoff;
1824 #if defined(DEBUG)
1825  printf("Directory overflow (%li) -- reducing size (cutoff = %i)\n", leafs, cutoff);
1826 #endif // DEBUG
1827  tree = cut_dtree(tree, cutoff, &memcount, &leafs);
1828  }
1829  }
1830 #ifdef DUMP_TREE
1831  printf("----------------------- tree with short branches:\n");
1832  dump_dtree(0, tree);
1833  printf("---------------------------\n");
1834 #endif
1835 
1836  // Try to create longer branches
1837 
1838  if (tree.exists) {
1839  int add_count;
1840  u_str buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
1841  int max_differ;
1842  long dummy;
1843 
1844  if (tree.full->typ != FULL_NODE) tree = single2full_dtree(tree, &memcount); // ensure root is FULL_NODE
1845 
1846  test_dtree(tree);
1847  calcCounts(tree); // calculate counters of inner nodes
1848  testCounts(tree);
1849 
1850  for (max_differ=0; max_differ<=MAX_DIFFER; max_differ+=INCR_DIFFER) { // percent of allowed difference for concatenating tree branches
1851  do {
1852  int idx;
1853  add_count = 0;
1854 
1855  for (idx=0; idx<256; idx++) {
1856  if (tree.full->son[idx].exists) {
1857  int added;
1858 
1859  buffer[0] = idx;
1860  added = expandBranches(buffer, 1, minwordlen-1, maxdeep, tree.full->son[idx], tree, max_differ);
1861  tree.full->count[idx] += added;
1862  add_count += added;
1863  }
1864  }
1865  }
1866  while (add_count);
1867  }
1868 
1869  gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
1870 
1871  tree = cut_useless_words(tree, 0, &dummy);
1872  }
1873 
1874 #ifdef DUMP_TREE
1875  printf("----------------------- tree with expanded branches:\n");
1876  dump_dtree(0, tree);
1877  printf("-----------------------\n");
1878 #endif
1879  testCounts(tree);
1880 
1881  return tree;
1882 }
1883 
1884 static DictTree remove_word_from_dtree(DictTree tree, cu_str wordStart, int wordLen, u_str resultBuffer, int *resultLen, long *resultFrequency, long *removed) {
1885  /* searches 'tree' for a word starting with 'wordStart' and removes it from the tree
1886  * if there is more than one possibility, the returned word will be the one with the most occurrences
1887  * if there was no possibility -> resultLen==0, tree unchanged
1888  * otherwise: resultBuffer contains the word, returns new tree with word removed
1889  */
1890  long removed_single = 0;
1891  gb_assert(tree.exists);
1892  *removed = 0;
1893 
1894  if (wordLen) { // search wanted path into tree
1895  switch (tree.full->typ) {
1896  case SINGLE_NODE: {
1897  if (tree.single->ch==*wordStart) {
1898  *resultBuffer = *wordStart;
1899 
1900  if (tree.single->son.exists) {
1901  gb_assert(tree.single->count>0);
1902  tree.single->son = remove_word_from_dtree(tree.single->son, wordStart+1, wordLen-1,
1903  resultBuffer+1, resultLen, resultFrequency,
1904  &removed_single);
1905  if (*resultLen) { // word removed
1906  gb_assert(tree.single->count>=removed_single);
1907  tree.single->count -= removed_single;
1908  *removed += removed_single;
1909  (*resultLen)++;
1910  }
1911  }
1912  else {
1913  *resultLen = wordLen==1; // if wordLen==1 -> fully overlapping word found
1914  *resultFrequency = tree.single->count;
1915  }
1916 
1917  if (!tree.single->son.exists && *resultLen) { // if no son and a word was found -> remove branch
1918  DictTree brother = tree.single->brother;
1919 
1920  *removed += tree.single->count;
1921  gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1922 
1923  if (brother.exists) tree = brother;
1924  else tree.exists = NULp;
1925  }
1926  }
1927  else if (tree.single->ch < *wordStart && tree.single->brother.exists) {
1928  tree.single->brother = remove_word_from_dtree(tree.single->brother, wordStart, wordLen,
1929  resultBuffer, resultLen, resultFrequency,
1930  &removed_single);
1931  if (*resultLen) *removed += removed_single;
1932  }
1933  else {
1934  *resultLen = 0; // not found
1935  }
1936 
1937  break;
1938  }
1939  case FULL_NODE: {
1940  unsigned_char ch = *wordStart;
1941  *resultBuffer = ch;
1942 
1943  if (tree.full->son[ch].exists) {
1944  tree.full->son[ch] = remove_word_from_dtree(tree.full->son[ch], wordStart+1, wordLen-1,
1945  resultBuffer+1, resultLen, resultFrequency,
1946  &removed_single);
1947  if (*resultLen) {
1948  if (tree.full->son[ch].exists) { // another son?
1949  tree.full->count[ch] -= removed_single;
1950  }
1951  else { // last son -> remove whole branch
1952  removed_single = tree.full->count[ch];
1953  tree.full->count[ch] = 0;
1954  tree.full->usedSons--;
1955  }
1956 
1957  *removed += removed_single;
1958  (*resultLen)++;
1959  }
1960  }
1961  else if (tree.full->count[ch]) {
1962  *resultLen = (wordLen==1);
1963 
1964  if (*resultLen) {
1965  *removed += removed_single = *resultFrequency = tree.full->count[ch];
1966  tree.full->count[ch] = 0;
1967  tree.full->usedSons--;
1968  }
1969  }
1970  else {
1971  *resultLen = 0; // not found
1972  }
1973 
1974  if (!tree.full->usedSons) {
1975  free_dtree(tree);
1976  tree.exists = NULp;
1977  }
1978 
1979  break;
1980  }
1981  }
1982  }
1983  else { // take any word
1984  switch (tree.full->typ) {
1985  case SINGLE_NODE: {
1986  *resultBuffer = tree.single->ch;
1987  gb_assert(tree.single->count>0);
1988 
1989  if (tree.single->son.exists) {
1990  tree.single->son = remove_word_from_dtree(tree.single->son, wordStart, wordLen,
1991  resultBuffer+1, resultLen, resultFrequency,
1992  &removed_single);
1993  gb_assert(*resultLen);
1994  (*resultLen)++;
1995  }
1996  else {
1997  *resultLen = 1;
1998  *resultFrequency = tree.single->count;
1999  removed_single = tree.single->count;
2000  }
2001 
2002  gb_assert(*resultFrequency>0);
2003 
2004  if (tree.single->son.exists) {
2005  gb_assert(tree.single->count>=removed_single);
2006  tree.single->count -= removed_single;
2007  *removed += removed_single;
2008  }
2009  else {
2010  DictTree brother = tree.single->brother;
2011 
2012  *removed += tree.single->count;
2013  gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
2014 
2015  if (brother.exists) tree = brother;
2016  else tree.exists = NULp;
2017  }
2018 
2019  break;
2020  }
2021  case FULL_NODE: {
2022  int idx;
2023 
2024  for (idx=0; idx<256; idx++) {
2025  if (tree.full->son[idx].exists) {
2026  *resultBuffer = idx;
2027  tree.full->son[idx] = remove_word_from_dtree(tree.full->son[idx], wordStart, wordLen,
2028  resultBuffer+1, resultLen, resultFrequency,
2029  &removed_single);
2030  gb_assert(*resultLen);
2031  (*resultLen)++;
2032 
2033  if (!tree.full->son[idx].exists) { // son branch removed -> zero count
2034  removed_single = tree.full->count[idx];
2035  tree.full->count[idx] = 0;
2036  tree.full->usedSons--;
2037  }
2038  else {
2039  tree.full->count[idx] -= removed_single;
2040  gb_assert(tree.full->count[idx]>0);
2041  }
2042 
2043  break;
2044  }
2045  else if (tree.full->count[idx]) {
2046  *resultBuffer = idx;
2047  *resultLen = 1;
2048  *resultFrequency = tree.full->count[idx];
2049  removed_single = tree.full->count[idx];
2050  tree.full->count[idx] = 0;
2051  tree.full->usedSons--;
2052  break;
2053  }
2054  }
2055 
2056  gb_assert(idx<256); // assert that the loop was left via break (i.e. the node had a son)
2057 
2058  *removed += removed_single;
2059 
2060  if (!tree.full->usedSons) {
2061  free_dtree(tree);
2062  tree.exists = NULp;
2063  }
2064 
2065  break;
2066  }
2067  }
2068  }
2069 
2070 #ifdef DEBUG
2071  if (*resultLen) {
2072  gb_assert(*resultLen>0);
2073  gb_assert(*resultFrequency>0);
2074  gb_assert(*resultLen>=wordLen);
2075  }
2076 #endif
2077 
2078  return tree;
2079 }
2080 
2081 #define cmp(i1, i2) (heap2[i1]-heap2[i2])
2082 #define swap(i1, i2) do \
2083  { \
2084  int s = heap[i1]; \
2085  heap[i1] = heap[i2]; \
2086  heap[i2] = s; \
2087  \
2088  s = heap2[i1]; \
2089  heap2[i1] = heap2[i2]; \
2090  heap2[i2] = s; \
2091  } \
2092  while (0)
2093 
2094 static void downheap(int *heap, int *heap2, int me, int num) {
2095  int lson = me*2;
2096  int rson = lson+1;
2097 
2098  gb_assert(me>=1);
2099  if (lson>num) return;
2100 
2101  if (cmp(lson, me)<0) { // left son smaller than me? (we sort in descending order!!!)
2102  if (rson<=num && cmp(lson, rson)>0) { // right son smaller than left son?
2103  swap(me, rson);
2104  downheap(heap, heap2, rson, num);
2105  }
2106  else {
2107  swap(me, lson);
2108  downheap(heap, heap2, lson, num);
2109  }
2110  }
2111  else if (rson<=num && cmp(me, rson)>0) { // right son smaller than me?
2112  swap(me, rson);
2113  downheap(heap, heap2, rson, num);
2114  }
2115 }
2116 
2117 #undef cmp
2118 #undef swap
2119 
2120 
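The cmp/swap macros above and downheap() implement a sift-down over two parallel arrays: heap (the offsets) and heap2 (the frequencies) are swapped in lockstep, and the comparison direction yields a sort in descending order. The following stand-alone sketch (hypothetical key/payload arrays, not ARB code) shows the same parallel-array heapsort pattern:

#include <cstdio>
#include <utility>

// sift-down for 1-based parallel arrays: order by 'key', keep 'payload' aligned
// (mirrors downheap() above; the min-heap + selection from the end gives descending order)
static void sift_down(long *key, long *payload, int me, int num) {
    int lson = me*2, rson = lson+1;
    if (lson > num) return;

    int smallest = me;
    if (key[lson] < key[smallest]) smallest = lson;
    if (rson <= num && key[rson] < key[smallest]) smallest = rson;

    if (smallest != me) {
        std::swap(key[me], key[smallest]);
        std::swap(payload[me], payload[smallest]);
        sift_down(key, payload, smallest, num);
    }
}

int main() {
    long freq[]   = { 0, 3, 9, 1, 7 };   // index 0 unused (1-based heap)
    long offset[] = { 0, 10, 20, 30, 40 };
    int  num      = 4;

    for (int i = num/2; i >= 1; i--) sift_down(freq, offset, i, num); // build min-heap
    while (num > 1) {                                                 // move minimum to the end
        std::swap(freq[1], freq[num]);
        std::swap(offset[1], offset[num]);
        num--;
        sift_down(freq, offset, 1, num);
    }
    for (int i = 1; i <= 4; i++) printf("freq=%ld offset=%ld\n", freq[i], offset[i]);
    // prints frequencies in descending order (9 7 3 1) with offsets still aligned
}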
2121 
2122 #define cmp(i1, i2) GB_MEMCMP(dict->text+dict->offsets[heap[i1]], dict->text+dict->offsets[heap[i2]], dict->textlen)
2123 #define swap(i1, i2) do { int s = heap[i1]; heap[i1] = heap[i2]; heap[i2] = s; } while (0)
2124 
2125 static void downheap2(int *heap, GB_DICTIONARY *dict, int me, int num) {
2126  int lson = me*2;
2127  int rson = lson+1;
2128 
2129  gb_assert(me>=1);
2130  if (lson>num) return;
2131 
2132  if (cmp(lson, me)>0) { // left son bigger than me?
2133  if (rson<=num && cmp(lson, rson)<0) { // right son bigger than left son?
2134  swap(me, rson);
2135  downheap2(heap, dict, rson, num);
2136  }
2137  else {
2138  swap(me, lson);
2139  downheap2(heap, dict, lson, num);
2140  }
2141  }
2142  else if (rson<=num && cmp(me, rson)<0) { // right son bigger than me?
2143  swap(me, rson);
2144  downheap2(heap, dict, rson, num);
2145  }
2146 }
2147 
2148 #undef cmp
2149 #undef swap
2150 
2151 static void sort_dict_offsets(GB_DICTIONARY *dict) {
2152  /* 1. sorts 'dict->offsets' by frequency
2153  * (the frequency of each offset is stored in 'dict->resort' at the same index)
2154  * 2. initializes & sorts 'dict->resort' in alphabetical order
2155  */
2156  int i;
2157  int num = dict->words;
2158  int *heap = dict->offsets-1;
2159  int *heap2 = dict->resort-1;
2160 
2161  // sort offsets
2162 
2163  for (i=num/2; i>=1; i--) downheap(heap, heap2, i, num); // make heap
2164 
2165  while (num>1) { // sort heap
2166  int big = heap[1];
2167  int big2 = heap2[1];
2168 
2169  heap[1] = heap[num];
2170  heap2[1] = heap2[num];
2171 
2172  downheap(heap, heap2, 1, num-1);
2173 
2174  heap[num] = big;
2175  heap2[num] = big2;
2176 
2177  num--;
2178  }
2179 
2180  // initialize dict->resort
2181 
2182  for (i=0, num=dict->words; i<num; i++) dict->resort[i] = i; // LOOP_VECTORIZED
2183 
2184  // sort dictionary alphabetically
2185 
2186  for (i=num/2; i>=1; i--) downheap2(heap2, dict, i, num); // make heap
2187 
2188  while (num>1) {
2189  int big = heap2[1];
2190 
2191  heap2[1] = heap2[num];
2192  downheap2(heap2, dict, 1, num-1);
2193  heap2[num] = big;
2194  num--;
2195  }
2196 }
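As a concrete illustration of what sort_dict_offsets() produces, the following stand-alone sketch (plain standard C++ with made-up words and frequencies, not tied to GB_DICTIONARY) derives the same two orderings: offsets arranged by descending frequency, plus a resort index that walks those offsets alphabetically:

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    // toy dictionary text and three words located at the given offsets
    const char *text      = "gammaalphabeta";
    int         offsets[] = { 0, 5, 10 };   // -> "gamma...", "alpha...", "beta"
    long        freq[]    = { 4, 9, 6 };    // how often each word occurred

    // pass 1: order offsets by descending frequency (as sort_dict_offsets() does)
    std::vector<int> idx = { 0, 1, 2 };
    std::sort(idx.begin(), idx.end(), [&](int a, int b) { return freq[a] > freq[b]; });
    int byFreq[3];
    for (int i = 0; i < 3; i++) byFreq[i] = offsets[idx[i]];
    // byFreq == { 5, 10, 0 }  -> "alpha" (9), "beta" (6), "gamma" (4)

    // pass 2: 'resort' holds indices into byFreq, ordered alphabetically by word text
    int resort[] = { 0, 1, 2 };
    std::sort(resort, resort+3, [&](int a, int b) {
        return strcmp(text+byFreq[a], text+byFreq[b]) < 0;
    });
    for (int i = 0; i < 3; i++)
        printf("alphabetic #%d -> offset %d (%s)\n", i, byFreq[resort[i]], text+byFreq[resort[i]]);
    // alphabetic walk visits offsets 5, 10, 0
}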
2197 
2198 // Warning: dictionary is not in network byte order!
2199 static GB_DICTIONARY *gb_create_dictionary(O_gbdByKey *gbk, long maxmem) {
2200  long data_sum;
2201  DictTree tree = build_dict_tree(gbk, maxmem, MAX_WORD_LEN, MIN_WORD_LEN, &data_sum);
2202 
2203  if (tree.exists) {
2204  GB_DICTIONARY *dict = (GB_DICTIONARY*)gbm_get_mem(sizeof(*dict), GBM_DICT_INDEX);
2205  int maxdeep = 0;
2206  int words = count_dtree_leafs(tree, 0, &maxdeep);
2207  u_str word;
2208 
2209  int wordLen;
2210  long wordFrequency;
2211  int offset = 0; // next free position in dict->text
2212  int overlap = 0; // # of bytes overlapping with last word
2213  u_str buffer;
2214  long dummy;
2215 #if defined(DEBUG)
2216  long word_sum = 0;
2217  long overlap_sum = 0;
2218  long max_overlap = 0;
2219 #endif
2220 
2221  // reduce tree as long as it has too many leaves (>MAX_LONG_INDEX)
2222  while (words >= MAX_LONG_INDEX) {
2223 
2224  words = count_dtree_leafs(tree, 0, &maxdeep);
2225  }
2226 
2227  buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
2228 
2229  calcCounts(tree);
2230  testCounts(tree);
2231 
2232 #if DEBUG
2233  printf(" examined data was %li bytes\n", data_sum);
2234  printf(" tree contains %i words *** maximum tree depth = %i\n", words, maxdeep);
2235 #endif
2236 
2237  dict->words = 0;
2238  dict->textlen = DICT_STRING_INCR;
2239 
2240  dict->text = (u_str)gbm_get_mem(dict->textlen, GBM_DICT_INDEX);
2241  dict->offsets = (GB_NINT*)gbm_get_mem(sizeof(*(dict->offsets))*words, GBM_DICT_INDEX);
2242  dict->resort = (GB_NINT*)gbm_get_mem(sizeof(*(dict->resort))*words, GBM_DICT_INDEX);
2243 
2244  memset(buffer, '*', maxdeep);
2245  tree = remove_word_from_dtree(tree, NULp, 0, buffer, &wordLen, &wordFrequency, &dummy);
2246  testCounts(tree);
2247 
2248  while (1) {
2249  int nextWordLen = 0;
2250  int len;
2251 
2252 #if DUMP_COMPRESSION_TEST>=4
2253  printf("word='%s' (occur=%li overlap=%i)\n", lstr(buffer, wordLen), wordFrequency, overlap);
2254 #endif
2255 
2256 #if defined(DEBUG)
2257  overlap_sum += overlap;
2258  if (overlap>max_overlap) max_overlap = overlap;
2259  word_sum += wordLen;
2260 #endif
2261 
2262  if (offset-overlap+wordLen > dict->textlen) { // if not enough space allocated -> reallocate dictionary string
2263  u_str ntext = (u_str)gbm_get_mem(dict->textlen+DICT_STRING_INCR, GBM_DICT_INDEX);
2264 
2265  memcpy(ntext, dict->text, dict->textlen);
2266  gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2267 
2268  dict->text = ntext;
2269  dict->textlen += DICT_STRING_INCR;
2270  }
2271 
2272  dict->offsets[dict->words] = offset-overlap;
2273  dict->resort[dict->words] = wordFrequency; // temporarily misuse this to store the frequency
2274  dict->words++;
2275 
2276  word = dict->text+offset-overlap;
2277  gb_assert(overlap==0 || GB_MEMCMP(word, buffer, overlap)==0); // test overlapping string-part
2278  memcpy(word, buffer, wordLen); // word -> dictionary string
2279  offset += wordLen-overlap;
2280 
2281  if (!tree.exists) break;
2282 
2283  for (len=min(10, wordLen-1); len>=0 && nextWordLen==0; len--) {
2284  memset(buffer, '*', maxdeep);
2285  tree = remove_word_from_dtree(tree, word+wordLen-len, len, buffer, &nextWordLen, &wordFrequency, &dummy);
2286  overlap = len;
2287  }
2288 
2289  wordLen = nextWordLen;
2290  }
2291 
2292  gb_assert(dict->words <= MAX_LONG_INDEX);
2293  gb_assert(dict->words==words); /* dict->words == # of words stored in dictionary string
2294  * words == # of words pre-calculated */
2295 
2296 #if DEBUG
2297  printf(" word_sum=%li overlap_sum=%li (%li%%) max_overlap=%li\n",
2298  word_sum, overlap_sum, (overlap_sum*100)/word_sum, max_overlap);
2299 #endif
2300 
2301  if (offset<dict->textlen) { // reallocate dict->text if it was allocated too large
2302  u_str ntext = (u_str)gbm_get_mem(offset, GBM_DICT_INDEX);
2303 
2304  memcpy(ntext, dict->text, offset);
2305  gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2306 
2307  dict->text = ntext;
2308  dict->textlen = offset;
2309  }
2310 
2311  sort_dict_offsets(dict);
2312 
2313  gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
2314  free_dtree(tree);
2315 
2316  return dict;
2317  }
2318 
2319  return NULp;
2320 }
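The packing loop inside gb_create_dictionary() appends each extracted word to the dictionary text and tries to reuse up to 10 trailing characters of the previous word as the prefix of the next one, so neighbouring words share storage. A stand-alone sketch of that overlap-packing idea (hypothetical helper append_with_overlap, not part of ARB and not tied to the DictTree machinery):

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

// append 'word' to 'text', reusing the longest suffix of 'text' that is also a
// prefix of 'word' (bounded, like the dictionary builder bounds its overlap)
static size_t append_with_overlap(std::string &text, const std::string &word, int maxOverlap) {
    int tryLen = (int)std::min<size_t>(maxOverlap, std::min(text.size(), word.size()));
    for (int len = tryLen; len > 0; --len) {
        if (text.compare(text.size()-len, len, word, 0, len) == 0) {
            size_t offset = text.size()-len;        // word starts inside existing text
            text.append(word, len, std::string::npos);
            return offset;
        }
    }
    size_t offset = text.size();                    // no overlap found
    text += word;
    return offset;
}

int main() {
    std::vector<std::string> words = { "ACGTACGT", "ACGTTTTT", "TTTTGGGG" };
    std::string text;
    size_t raw = 0;

    for (const std::string &w : words) {
        size_t off = append_with_overlap(text, w, 10);
        raw += w.size();
        printf("word '%s' stored at offset %zu\n", w.c_str(), off);
    }
    printf("packed text: '%s' (%zu bytes instead of %zu)\n", text.c_str(), text.size(), raw);
}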
2321 
2322 static void gb_free_dictionary(GB_DICTIONARY*& dict) {
2323  gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2324  gbm_free_mem(dict->offsets, sizeof(*(dict->offsets))*dict->words, GBM_DICT_INDEX);
2325  gbm_free_mem(dict->resort, sizeof(*(dict->resort))*dict->words, GBM_DICT_INDEX);
2326 
2327  gbm_free_mem(dict, sizeof(*dict), GBM_DICT_INDEX);
2328  dict = NULp;
2329 }
2330 
2331 static GB_ERROR readAndWrite(O_gbdByKey *gbkp, size_t& old_size, size_t& new_size) {
2332  GB_ERROR error = NULp;
2333 
2334  old_size = 0;
2335  new_size = 0;
2336 
2337  for (int i=0; i<gbkp->cnt && !error; i++) {
2338  GBDATA *gbd = gbkp->gbds[i];
2339 
2340  if (COMPRESSIBLE(gbd->type())) {
2341  size_t size;
2342  char *data;
2343 
2344  {
2345  cu_str d = (cu_str)get_data_n_size(gbd, &size);
2346  old_size += gbd->as_entry()->memsize();
2347 
2348  data = (char*)gbm_get_mem(size, GBM_DICT_INDEX);
2349  memcpy(data, d, size);
2350  gb_assert(data[size-1] == 0);
2351  }
2352 
2353  switch (gbd->type()) {
2354  case GB_STRING:
2355  error = GB_write_string(gbd, "");
2356  if (!error) error = GB_write_string(gbd, data);
2357  break;
2358  case GB_BYTES:
2359  error = GB_write_bytes(gbd, NULp, 0);
2360  if (!error) error = GB_write_bytes(gbd, data, size);
2361  break;
2362  case GB_INTS:
2363  error = GB_write_ints(gbd, (GB_UINT4 *)NULp, 0);
2364  if (!error) error = GB_write_ints(gbd, (GB_UINT4 *)data, size);
2365  break;
2366  case GB_FLOATS:
2367  error = GB_write_floats(gbd, (float*)NULp, 0);
2368  if (!error) error = GB_write_floats(gbd, (float*)(void*)data, size);
2369  break;
2370  default:
2371  gb_assert(0);
2372  break;
2373  }
2374 
2375  new_size += gbd->as_entry()->memsize();
2376 
2377  gbm_free_mem(data, size, GBM_DICT_INDEX);
2378  }
2379  }
2380  return error;
2381 }
2382 
2383 static GB_ERROR gb_create_dictionaries(GB_MAIN_TYPE *Main, long maxmem) {
2384  GB_ERROR error = NULp;
2385 #if defined(TEST_DICT)
2386  long uncompressed_sum = 0;
2387  long compressed_sum = 0;
2388 #endif // TEST_DICT
2389 
2390  printf("Creating GBDATA-Arrays..\n");
2391 
2392  if (!error) {
2393  O_gbdByKey *gbk = g_b_opti_createGbdByKey(Main);
2394  int idx = -1;
2395 
2396  printf("Creating dictionaries..\n");
2397 
2398 #ifdef DEBUG
2399  // #define TEST_ONE // test only key specified below
2400  // #define TEST_SOME // test only some keys specified below
2401 #if defined(TEST_ONE)
2402  // select wanted index
2403  for (idx=0; idx<gbdByKey_cnt; idx++) { // title author dew_author ebi_journal name ua_tax date full_name ua_title
2404  if (gbk[idx].cnt && strcmp(quark2key(Main, idx), "tree")==0) break;
2405  }
2406  gb_assert(idx<gbdByKey_cnt);
2407 #endif
2408 #endif
2409 
2410 #ifdef TEST_ONE
2411  // only create dictionary for index selected above (no loop)
2412 #else
2413  // create dictionaries for all indices (this is the normal operation)
2414  arb_progress progress("Optimizing key data", long(gbdByKey_cnt-1));
2415  for (idx = gbdByKey_cnt-1; idx >= 1 && !error; --idx, progress.inc_and_check_user_abort(error))
2416 #endif
2417 
2418  {
2419  GB_DICTIONARY *dict;
2420 
2421  GB_CSTR key_name = quark2key(Main, idx);
2422  GBDATA *gb_main = Main->gb_main();
2423 
2424 #ifdef TEST_SOME
2425  if (!( // add all wanted keys here
2426  strcmp(key_name, "REF") == 0 ||
2427  strcmp(key_name, "ref") == 0
2428  )) continue;
2429 #endif // TEST_SOME
2430 
2431 #ifndef TEST_ONE
2432  if (!gbk[idx].cnt) continue; // there are no entries with this quark
2433 
2434  GB_TYPES type = gbk[idx].gbds[0]->type();
2435 
2436  GB_begin_transaction(gb_main);
2437  int compression_mask = gb_get_compression_mask(Main, idx, type);
2438  GB_commit_transaction(gb_main);
2439 
2440  if ((compression_mask & GB_COMPRESSION_DICTIONARY) == 0) continue; // compression with dictionary is not allowed
2441  if (strcmp(key_name, "data") == 0) continue;
2442  if (strcmp(key_name, "quality") == 0) continue;
2443 #endif
2444 
2445  printf("- dictionary for '%s' (idx=%i)\n", key_name, idx);
2446  GB_begin_transaction(gb_main);
2447  dict = gb_create_dictionary(&(gbk[idx]), maxmem);
2448 
2449  if (dict) {
2450  /* decompress with the old dictionary and write
2451  all data of the current type without compression: */
2452 
2453  printf(" * Uncompressing all with old dictionary ...\n");
2454 
2455  size_t old_compressed_size;
2456 
2457  {
2458  int& compr_mask = Main->keys[idx].compression_mask;
2459  int old_compr_mask = compr_mask;
2460 
2461  size_t new_size;
2462 
2463  compr_mask &= ~GB_COMPRESSION_DICTIONARY;
2464  error = readAndWrite(&gbk[idx], old_compressed_size, new_size);
2465  compr_mask = old_compr_mask;
2466  }
2467 
2468  if (!error) {
2469  /* dictionary is saved in the following format:
2470  *
2471  * GB_NINT size
2472  * GB_NINT offsets[dict->words]
2473  * GB_NINT resort[dict->words]
2474  * char *text
2475  */
2476 
2477  int dict_buffer_size = sizeof(GB_NINT) * (1+dict->words*2) + dict->textlen;
2478  char *dict_buffer = (char*)gbm_get_mem(dict_buffer_size, GBM_DICT_INDEX);
2479  long old_dict_buffer_size;
2480  char *old_dict_buffer;
2481 
2482  {
2483  GB_NINT *nint = (GB_NINT*)dict_buffer;
2484  int n;
2485 
2486  *nint++ = htonl(dict->words);
2487  for (n=0; n<dict->words; n++) *nint++ = htonl(dict->offsets[n]);
2488  for (n=0; n<dict->words; n++) *nint++ = htonl(dict->resort[n]);
2489 
2490  memcpy(nint, dict->text, dict->textlen);
2491  }
2492 
2493  const char *key = Main->keys[idx].key;
2494 
2495  error = gb_load_dictionary_data(gb_main, key, &old_dict_buffer, &old_dict_buffer_size);
2496  if (!error) {
2497  gb_save_dictionary_data(gb_main, key, dict_buffer, dict_buffer_size);
2498 
2499  // compress all data with new dictionary
2500  printf(" * Compressing all with new dictionary ...\n");
2501 
2502  size_t old_size, new_compressed_size;
2503  error = readAndWrite(&gbk[idx], old_size, new_compressed_size);
2504 
2505  if (!error) {
2506  printf(" (compressed size: old=%zu new=%zu ratio=%.1f%%)\n",
2507  old_compressed_size, new_compressed_size, (new_compressed_size*100.0)/old_compressed_size);
2508 
2509  // @@@ for some keys compression fails (e.g. 'ref');
2510  // need to find out why dictionary is so bad
2511  // gb_assert(new_compressed_size <= old_compressed_size); // @@@ enable this (fails in unit-test)
2512  }
2513 
2514  if (error) {
2515  /* critical state: new dictionary has been written, but transaction will be aborted below.
2516  * Solution: Write back old dictionary.
2517  */
2518  gb_save_dictionary_data(gb_main, key, old_dict_buffer, old_dict_buffer_size);
2519  }
2520  }
2521 
2522  gbm_free_mem(dict_buffer, dict_buffer_size, GBM_DICT_INDEX);
2523  if (old_dict_buffer) gbm_free_mem(old_dict_buffer, old_dict_buffer_size, GBM_DICT_INDEX);
2524 
2525 #if defined(TEST_DICT)
2526  if (!error) {
2527  GB_DICTIONARY *dict_reloaded = gb_get_dictionary(Main, idx);
2528  test_dictionary(dict_reloaded, &(gbk[idx]), &uncompressed_sum, &compressed_sum);
2529  }
2530 #endif // TEST_DICT
2531  }
2532 
2533  gb_free_dictionary(dict);
2534  }
2535 
2536  error = GB_end_transaction(gb_main, error);
2537  }
2538 
2539 #ifdef TEST_DICT
2540  if (!error) {
2541  printf(" overall uncompressed size = %li b\n"
2542  " overall compressed size = %li b (Ratio=%li%%)\n",
2543  uncompressed_sum, compressed_sum,
2544  (compressed_sum*100)/uncompressed_sum);
2545  }
2546 #endif // TEST_DICT
2547 
2548  printf("Done.\n");
2549 
2550  g_b_opti_freeGbdByKey(gbk);
2551  }
2552 
2553  return error;
2554 }
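The dictionary buffer saved by gb_create_dictionaries() uses the layout documented in the comment above: a word count, the offsets and resort arrays, and finally the dictionary text, with all integers converted via htonl(). A stand-alone decoding sketch for a buffer in that layout (the local Dict struct and read_nint helper are illustrative stand-ins, not ARB's GB_DICTIONARY):

#include <netinet/in.h>   // ntohl/htonl, as used by the dictionary code itself
#include <cstdint>
#include <cstring>
#include <vector>

struct Dict {                          // local stand-in for the decoded dictionary
    std::vector<uint32_t> offsets;
    std::vector<uint32_t> resort;
    std::vector<char>     text;
};

static uint32_t read_nint(const char *&p) {   // read one big-endian 32-bit value
    uint32_t v;
    memcpy(&v, p, sizeof(v));
    p += sizeof(v);
    return ntohl(v);
}

// layout: word-count | offsets[words] | resort[words] | dictionary text
static Dict decode_dict_buffer(const char *buffer, size_t buffer_size) {
    Dict d;
    const char *p = buffer;

    uint32_t words = read_nint(p);
    d.offsets.resize(words);
    d.resort.resize(words);
    for (uint32_t n = 0; n < words; n++) d.offsets[n] = read_nint(p);
    for (uint32_t n = 0; n < words; n++) d.resort[n]  = read_nint(p);

    d.text.assign(p, buffer+buffer_size);     // the rest of the buffer is the text
    return d;
}

int main() {
    // round-trip a one-word dictionary ("ACGT" stored at offset 0)
    uint32_t head[] = { htonl(1), htonl(0), htonl(0) };
    std::vector<char> buf((const char*)head, (const char*)head + sizeof(head));
    const char *word = "ACGT";
    buf.insert(buf.end(), word, word+4);

    Dict d = decode_dict_buffer(buf.data(), buf.size());
    return (d.offsets.size() == 1 && memcmp(d.text.data(), "ACGT", 4) == 0) ? 0 : 1;
}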
2555 
2556 GB_ERROR GB_optimize(GBDATA *gb_main) {
2557  unsigned long maxKB = GB_get_usable_memory();
2558  long maxMem;
2559  GB_ERROR error = NULp;
2560  GB_UNDO_TYPE prev_undo_type = GB_get_requested_undo_type(gb_main);
2561 
2562 #ifdef DEBUG
2563  maxKB /= 2;
2564 #endif
2565 
2566  if (maxKB<=(LONG_MAX/1024)) maxMem = maxKB*1024;
2567  else maxMem = LONG_MAX;
2568 
2569  error = GB_request_undo_type(gb_main, GB_UNDO_KILL);
2570  if (!error) {
2571  error = gb_create_dictionaries(GB_MAIN(gb_main), maxMem);
2572  if (!error) GB_disable_quicksave(gb_main, "Database optimized");
2573  ASSERT_NO_ERROR(GB_request_undo_type(gb_main, prev_undo_type));
2574  }
2575  return error;
2576 }
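The memory computation above converts the usable memory from KB to bytes while guarding against signed overflow: the multiplication by 1024 is only performed when the result still fits into a long, otherwise LONG_MAX is used. A tiny stand-alone sketch of that clamping (hypothetical helper, not part of ARB):

#include <climits>
#include <cstdio>

// convert kilobytes to bytes, clamping at LONG_MAX instead of overflowing
static long kb_to_bytes_clamped(unsigned long kb) {
    return (kb <= (unsigned long)(LONG_MAX/1024)) ? (long)(kb*1024) : LONG_MAX;
}

int main() {
    printf("%ld\n", kb_to_bytes_clamped(4096));        // 4194304
    printf("%ld\n", kb_to_bytes_clamped(ULONG_MAX/2)); // clamped to LONG_MAX
}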
2577 
2578 // --------------------------------------------------------------------------------
2579 
2580 #ifdef UNIT_TESTS
2581 #ifndef TEST_UNIT_H
2582 #include <test_unit.h>
2583 #endif
2584 
2585 // #define TEST_AUTO_UPDATE // uncomment to auto-update binary result of DB optimization
2586 // #define TEST_AUTO_UPDATE_ASCII // uncomment to auto-update ascii-input from output (be careful with this!)
2587 
2588 void TEST_SLOW_optimize() {
2589  // test DB optimization (optimizes compression)
2590  // [current coverage ~89%]
2591  //
2592  // Note: a test for sequence compression is in adseqcompr.cxx@TEST_SLOW_sequence_compression
2593 
2594  GB_shell shell;
2595 
2596  const char *source_ascii = "TEST_opti_ascii_in.arb";
2597  const char *target_ascii = "TEST_opti_ascii_out.arb";
2598 
2599  const char *nonopti = "TEST_opti_none.arb";
2600  const char *optimized = "TEST_opti_initial.arb";
2601  const char *reoptimized = "TEST_opti_again.arb";
2602  const char *expected = "TEST_opti_expected.arb"; // expected result after optimize
2603 
2604  // initial optimization of ASCII-DB
2605  {
2606  GBDATA *gb_main = GB_open(source_ascii, "rw");
2607 
2608  TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, nonopti, "b"));
2609 
2610  {
2611  GB_topSecurityLevel unsecured(gb_main);
2612  TEST_EXPECT_NO_ERROR(GB_optimize(gb_main));
2613  }
2614 
2615  GB_flush_cache(gb_main);
2616 
2617  TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, optimized, "b"));
2618  TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, target_ascii, "a"));
2619 
2620 #if defined(TEST_AUTO_UPDATE)
2621  TEST_COPY_FILE(optimized, expected);
2622 #endif
2623 #if defined(TEST_AUTO_UPDATE_ASCII)
2624  TEST_COPY_FILE(target_ascii, source_ascii);
2625 #endif
2626 
2627  TEST_EXPECT_TEXTFILE_DIFFLINES(source_ascii, target_ascii, 0);
2628  TEST_EXPECT_FILES_EQUAL(optimized, expected);
2629 
2630  // check that optimization made sense:
2631  long nonopti_size = GB_size_of_file(nonopti);
2632  long optimized_size = GB_size_of_file(optimized);
2633  TEST_EXPECT_LESS(optimized_size, nonopti_size); // did file shrink?
2634  TEST_EXPECT_EQUAL(optimized_size*100/nonopti_size, 74); // document compression ratio (in percent)
2635 
2636  GB_close(gb_main);
2637  }
2638 
2639  // re-optimize DB (which is already compressed)
2640  {
2641  GBDATA *gb_main = GB_open(optimized, "rw");
2642 
2643  {
2644  GB_topSecurityLevel unsecured(gb_main);
2645  TEST_EXPECT_NO_ERROR(GB_optimize(gb_main));
2646  }
2647 
2648  TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, reoptimized, "b"));
2649  TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, target_ascii, "a"));
2650 
2651  TEST_EXPECT_TEXTFILE_DIFFLINES(source_ascii, target_ascii, 0);
2652  TEST_EXPECT_FILES_EQUAL(reoptimized, expected); // reoptimize should produce same result as initial optimize
2653 
2654  GB_close(gb_main);
2655  }
2656 
2657  TEST_EXPECT_NO_ERROR(GBK_system(GBS_global_string("rm %s %s %s %s", target_ascii, nonopti, optimized, reoptimized)));
2658 }
2659 
2660 void TEST_ticket_742() {
2661  // reproduce problem reported in ticket #742
2662  GB_shell shell;
2663  {
2664  const char *opti_db = "TEST_opti_expected.arb"; // expected result after optimize
2665  GBDATA *gb_main = GBT_open(opti_db, "rw"); // use GBT_open to create index for 'name'
2666 
2667  for (int suf = 2; suf<=4; ++suf) {
2668  char *sainame = GBS_global_string_copy("forcecompress_%i", suf);
2669  char *mod_sainame = GBS_global_string_copy("%s.mod", sainame);
2670 
2671  GBDATA *gb_sai_name;
2672  {
2673  GB_transaction ta(gb_main);
2674 
2675  GBDATA *gb_sai = GBT_find_SAI(gb_main, sainame); TEST_REJECT_NULL(gb_sai);
2676  gb_sai_name = GB_entry(gb_sai, "name"); TEST_REJECT_NULL(gb_sai_name);
2677 
2678  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), sainame);
2679 
2680  TEST_EXPECT_NO_ERROR(GB_write_string(gb_sai_name, mod_sainame));
2681  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame); // fixed
2682 
2683  switch (suf) {
2684  case 2:
2685  // do nothing special (only test value in next TA below)
2686  break;
2687  case 3:
2688  // flushing the cache "solves" the problem => GB_write_string does not flush correctly
2689  GB_flush_cache(gb_sai_name);
2690  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame);
2691  break;
2692  case 4:
2693  // writing twice also "solves" the problem
2694  TEST_EXPECT_NO_ERROR(GB_write_string(gb_sai_name, mod_sainame));
2695  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame);
2696  break;
2697 
2698 
2699  default:
2700  TEST_REJECT(true);
2701  break;
2702  }
2703  }
2704 
2705  {
2706  GB_transaction ta(gb_main);
2707  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame); // ok in next TA
2708  }
2709 
2710  free(mod_sainame);
2711  free(sainame);
2712  }
2713 
2714  {
2715  GB_transaction ta(gb_main);
2716 
2717  GBDATA *gb_species = GBT_find_species(gb_main, "FrnPhilo"); TEST_REJECT_NULL(gb_species);
2718  GBDATA *gb_data = GB_create(gb_species, "data", GB_STRING); TEST_REJECT_NULL(gb_data);
2719 
2720  const char *seq1 = ".....................AUUCUGGUU-----GA--U-CC-U-G------------..............";
2721  const char *seq2 = "........A-A--CU---------------C-A-A-A-G-GA-G--AC---A-C-U-G...............";
2722 
2723  TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq1));
2724  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq1);
2726  TEST_EXPECT_EQUAL(gb_data->flags2.extern_data, 1);
2727 
2728  TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq2));
2729  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2);
2731  TEST_EXPECT_EQUAL(gb_data->flags2.extern_data, 1);
2732 
2734 
2735  TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq1));
2736  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq1); // fixed
2737 
2738  TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq2));
2739  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2); // seems to work ...
2740 
2741  GB_flush_cache(gb_data);
2742  TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2); // fixed
2743  }
2744 
2745  GB_close(gb_main);
2746  }
2747 }
2748 
2749 void TEST_AFTER_SLOW_streamed_ascii_save_asUsedBy_silva_pipeline() { // run after TEST_SLOW_loadsave!
2750  GB_shell shell;
2751 
2752  const char *loadname = "TEST_loadsave_ascii.arb";
2753  const char *savename = "TEST_streamsaved.arb";
2754 
2755  {
2756  GBDATA *gb_main1 = GB_open(loadname, "r"); TEST_REJECT_NULL(gb_main1);
2757  GBDATA *gb_main2 = GB_open(loadname, "rw"); TEST_REJECT_NULL(gb_main2);
2758 
2759  // delete all species from DB2
2760  GBDATA *gb_species_data2 = NULp;
2761  {
2762  GB_transaction ta(gb_main2);
2763  GB_topSecurityLevel unsecured(gb_main2);
2764  gb_species_data2 = GBT_get_species_data(gb_main2);
2765  for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data2);
2766  gb_species;
2767  gb_species = GBT_next_species(gb_species))
2768  {
2769  TEST_EXPECT_NO_ERROR(GB_delete(gb_species));
2770  }
2771  }
2772 
2773  for (int saveWhileTransactionOpen = 0; saveWhileTransactionOpen<=1; ++saveWhileTransactionOpen) {
2774  {
2775  ArbDBWriter *writer = NULp;
2776  TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "a", writer));
2777  TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_main2, gb_species_data2));
2778 
2779  {
2780  GB_transaction ta1(gb_main1);
2781  GB_transaction ta2(gb_main2);
2782 
2783  for (GBDATA *gb_species1 = GBT_first_species(gb_main1);
2784  gb_species1;
2785  gb_species1 = GBT_next_species(gb_species1))
2786  {
2787  GBDATA *gb_species2 = GB_create_container(gb_species_data2, "species");
2788  if (!gb_species2) {
2790  }
2791  else {
2792  TEST_EXPECT_NO_ERROR(GB_copy_dropMarksAndTempstate(gb_species2, gb_species1));
2793  GB_write_flag(gb_species2, GB_read_flag(gb_species1)); // copy marked flag
2794  }
2795 
2796  if (!saveWhileTransactionOpen) GB_commit_transaction(gb_main2);
2797  TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_species2, gb_species2));
2798  if (!saveWhileTransactionOpen) GB_begin_transaction(gb_main2);
2799 
2800  GB_topSecurityLevel unsecured(gb_main2);
2801  TEST_EXPECT_NO_ERROR(GB_delete(gb_species2));
2802  }
2803  }
2804 
2805  TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_species_data2, gb_main2));
2806  TEST_EXPECT_NO_ERROR(GB_finish_stream_save(writer));
2807  }
2808 
2809  // test file content
2810  TEST_EXPECT_TEXTFILES_EQUAL(savename, loadname); // (Note: if this fails, run auto-update in TEST_SLOW_loadsave)
2812  }
2813 
2814  // test some error cases:
2815  {
2816  // 1. stream binary (not supported)
2817  ArbDBWriter *writer = NULp;
2818  TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "b", writer));
2819  TEST_EXPECT_ERROR_CONTAINS(GB_stream_save_part(writer, gb_main2, gb_species_data2), "only supported for ascii");
2820  GB_finish_stream_save(writer);
2821  TEST_EXPECT(!GB_is_regularfile(savename)); // no partial file remains
2822 
2823  // 2. invalid use of GB_stream_save_part (not ancestors)
2824  TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "a", writer));
2825  TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_main2, gb_species_data2));
2826  GBDATA *gb_extended_data2;
2827  {
2828  GB_transaction ta(gb_main2);
2829  gb_extended_data2 = GBT_get_SAI_data(gb_main2);
2830  }
2831  TEST_EXPECT_ERROR_CONTAINS(GB_stream_save_part(writer, gb_species_data2, gb_extended_data2), "has to be an ancestor");
2832  GB_finish_stream_save(writer);
2833  TEST_EXPECT(!GB_is_regularfile(savename)); // no partial file remains
2834 
2835  }
2836  // TEST_EXPECT_ZERO_OR_SHOW_ERRNO(GB_unlink(savename));
2837 
2838  GB_close(gb_main2);
2839  GB_close(gb_main1);
2840  }
2841 }
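Stripped of the test scaffolding above, the streamed ASCII save follows a fixed call order: open a writer with GB_start_streamed_save_as(), stream the part from the root container down to species_data, then each species container, then the part from species_data back up to the root, and finally close the writer with GB_finish_stream_save(). A condensed sketch of that sequence (assumes the ARB declarations already included by this file; error handling abbreviated):

// condensed call order for a streamed ascii save (sketch, minimal error handling)
static GB_ERROR stream_save_species_only(GBDATA *gb_main, GBDATA *gb_species_data,
                                         const char *savename) {
    ArbDBWriter *writer = NULp;
    GB_ERROR     error  = GB_start_streamed_save_as(gb_main, savename, "a", writer);

    // header part: everything from the root down to (and including) species_data
    if (!error) error = GB_stream_save_part(writer, gb_main, gb_species_data);

    // ... emit individual species here via GB_stream_save_part(writer, gb_species, gb_species) ...

    // trailer part: from species_data back up to the root
    if (!error) error = GB_stream_save_part(writer, gb_species_data, gb_main);

    GB_ERROR finish_error = GB_finish_stream_save(writer);
    return error ? error : finish_error;
}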
2842 
2843 #endif // UNIT_TESTS
2844 
2845 // --------------------------------------------------------------------------------
2846 