Fossil SCM

New --compress option to "rebuild" causes more aggressive delta compression, which can result in a 30% or better size reduction in the database file, with a corresponding speedup of cloning.

drh 2011-03-01 13:20 trunk
Commit 301700a224211fac31f9a58d7e1ebdec32e95661
+13 -7
--- src/content.c
+++ src/content.c
@@ -746,21 +746,25 @@
746746
**
747747
** If srcid is a delta that depends on rid, then srcid is
748748
** converted to undeltaed text.
749749
**
750750
** If either rid or srcid contain less than 50 bytes, or if the
751
-** resulting delta does not achieve a compression of at least 25% on
752
-** its own the rid is left untouched.
751
+** resulting delta does not achieve a compression of at least 25%
752
+** the rid is left untouched.
753
+**
754
+** Return 1 if a delta is made and 0 if no delta occurs.
753755
*/
754
-void content_deltify(int rid, int srcid, int force){
756
+int content_deltify(int rid, int srcid, int force){
755757
int s;
756758
Blob data, src, delta;
757759
Stmt s1, s2;
760
+ int rc = 0;
761
+
758762
if( srcid==rid ) return;
759763
if( !force && findSrcid(rid)>0 ) return;
760764
if( content_is_private(srcid) && !content_is_private(rid) ){
761
- return;
765
+ return 0;
762766
}
763767
s = srcid;
764768
while( (s = findSrcid(s))>0 ){
765769
if( s==rid ){
766770
content_undelta(srcid);
@@ -768,20 +772,20 @@
768772
}
769773
}
770774
content_get(srcid, &src);
771775
if( blob_size(&src)<50 ){
772776
blob_reset(&src);
773
- return;
777
+ return 0;
774778
}
775779
content_get(rid, &data);
776780
if( blob_size(&data)<50 ){
777781
blob_reset(&src);
778782
blob_reset(&data);
779
- return;
783
+ return 0;
780784
}
781785
blob_delta_create(&src, &data, &delta);
782
- if( blob_size(&delta) < blob_size(&data)*0.75 ){
786
+ if( blob_size(&delta) <= blob_size(&data)*0.75 ){
783787
blob_compress(&delta, &delta);
784788
db_prepare(&s1, "UPDATE blob SET content=:data WHERE rid=%d", rid);
785789
db_prepare(&s2, "REPLACE INTO delta(rid,srcid)VALUES(%d,%d)", rid, srcid);
786790
db_bind_blob(&s1, ":data", &delta);
787791
db_begin_transaction();
@@ -789,14 +793,16 @@
789793
db_exec(&s2);
790794
db_end_transaction(0);
791795
db_finalize(&s1);
792796
db_finalize(&s2);
793797
verify_before_commit(rid);
798
+ rc = 1;
794799
}
795800
blob_reset(&src);
796801
blob_reset(&data);
797802
blob_reset(&delta);
803
+ return rc;
798804
}
799805
800806
/*
801807
** COMMAND: test-content-deltify
802808
**
803809
--- src/content.c
+++ src/content.c
@@ -746,21 +746,25 @@
746 **
747 ** If srcid is a delta that depends on rid, then srcid is
748 ** converted to undeltaed text.
749 **
750 ** If either rid or srcid contain less than 50 bytes, or if the
751 ** resulting delta does not achieve a compression of at least 25% on
752 ** its own the rid is left untouched.
 
 
753 */
754 void content_deltify(int rid, int srcid, int force){
755 int s;
756 Blob data, src, delta;
757 Stmt s1, s2;
 
 
758 if( srcid==rid ) return;
759 if( !force && findSrcid(rid)>0 ) return;
760 if( content_is_private(srcid) && !content_is_private(rid) ){
761 return;
762 }
763 s = srcid;
764 while( (s = findSrcid(s))>0 ){
765 if( s==rid ){
766 content_undelta(srcid);
@@ -768,20 +772,20 @@
768 }
769 }
770 content_get(srcid, &src);
771 if( blob_size(&src)<50 ){
772 blob_reset(&src);
773 return;
774 }
775 content_get(rid, &data);
776 if( blob_size(&data)<50 ){
777 blob_reset(&src);
778 blob_reset(&data);
779 return;
780 }
781 blob_delta_create(&src, &data, &delta);
782 if( blob_size(&delta) < blob_size(&data)*0.75 ){
783 blob_compress(&delta, &delta);
784 db_prepare(&s1, "UPDATE blob SET content=:data WHERE rid=%d", rid);
785 db_prepare(&s2, "REPLACE INTO delta(rid,srcid)VALUES(%d,%d)", rid, srcid);
786 db_bind_blob(&s1, ":data", &delta);
787 db_begin_transaction();
@@ -789,14 +793,16 @@
789 db_exec(&s2);
790 db_end_transaction(0);
791 db_finalize(&s1);
792 db_finalize(&s2);
793 verify_before_commit(rid);
 
794 }
795 blob_reset(&src);
796 blob_reset(&data);
797 blob_reset(&delta);
 
798 }
799
800 /*
801 ** COMMAND: test-content-deltify
802 **
803
--- src/content.c
+++ src/content.c
@@ -746,21 +746,25 @@
746 **
747 ** If srcid is a delta that depends on rid, then srcid is
748 ** converted to undeltaed text.
749 **
750 ** If either rid or srcid contain less than 50 bytes, or if the
751 ** resulting delta does not achieve a compression of at least 25%
752 ** the rid is left untouched.
753 **
754 ** Return 1 if a delta is made and 0 if no delta occurs.
755 */
756 int content_deltify(int rid, int srcid, int force){
757 int s;
758 Blob data, src, delta;
759 Stmt s1, s2;
760 int rc = 0;
761
762 if( srcid==rid ) return;
763 if( !force && findSrcid(rid)>0 ) return;
764 if( content_is_private(srcid) && !content_is_private(rid) ){
765 return 0;
766 }
767 s = srcid;
768 while( (s = findSrcid(s))>0 ){
769 if( s==rid ){
770 content_undelta(srcid);
@@ -768,20 +772,20 @@
772 }
773 }
774 content_get(srcid, &src);
775 if( blob_size(&src)<50 ){
776 blob_reset(&src);
777 return 0;
778 }
779 content_get(rid, &data);
780 if( blob_size(&data)<50 ){
781 blob_reset(&src);
782 blob_reset(&data);
783 return 0;
784 }
785 blob_delta_create(&src, &data, &delta);
786 if( blob_size(&delta) <= blob_size(&data)*0.75 ){
787 blob_compress(&delta, &delta);
788 db_prepare(&s1, "UPDATE blob SET content=:data WHERE rid=%d", rid);
789 db_prepare(&s2, "REPLACE INTO delta(rid,srcid)VALUES(%d,%d)", rid, srcid);
790 db_bind_blob(&s1, ":data", &delta);
791 db_begin_transaction();
@@ -789,14 +793,16 @@
793 db_exec(&s2);
794 db_end_transaction(0);
795 db_finalize(&s1);
796 db_finalize(&s2);
797 verify_before_commit(rid);
798 rc = 1;
799 }
800 blob_reset(&src);
801 blob_reset(&data);
802 blob_reset(&delta);
803 return rc;
804 }
805
806 /*
807 ** COMMAND: test-content-deltify
808 **
809
+2 -1
--- src/manifest.c
+++ src/manifest.c
@@ -1253,11 +1253,12 @@
12531253
manifest_destroy(*ppOther);
12541254
return;
12551255
}
12561256
12571257
/* Try to make the parent manifest a delta from the child, if that
1258
- ** is an appropriate thing to do.
1258
+ ** is an appropriate thing to do. For a new baseline, make the
1259
+ ** previous baseline a delta from the current baseline.
12591260
*/
12601261
if( (pParent->zBaseline==0)==(pChild->zBaseline==0) ){
12611262
content_deltify(pid, cid, 0);
12621263
}else if( pChild->zBaseline==0 && pParent->zBaseline!=0 ){
12631264
content_deltify(pParent->pBaseline->rid, cid, 0);
12641265
--- src/manifest.c
+++ src/manifest.c
@@ -1253,11 +1253,12 @@
1253 manifest_destroy(*ppOther);
1254 return;
1255 }
1256
1257 /* Try to make the parent manifest a delta from the child, if that
1258 ** is an appropriate thing to do.
 
1259 */
1260 if( (pParent->zBaseline==0)==(pChild->zBaseline==0) ){
1261 content_deltify(pid, cid, 0);
1262 }else if( pChild->zBaseline==0 && pParent->zBaseline!=0 ){
1263 content_deltify(pParent->pBaseline->rid, cid, 0);
1264
--- src/manifest.c
+++ src/manifest.c
@@ -1253,11 +1253,12 @@
1253 manifest_destroy(*ppOther);
1254 return;
1255 }
1256
1257 /* Try to make the parent manifest a delta from the child, if that
1258 ** is an appropriate thing to do. For a new baseline, make the
1259 ** previous baseline a delta from the current baseline.
1260 */
1261 if( (pParent->zBaseline==0)==(pChild->zBaseline==0) ){
1262 content_deltify(pid, cid, 0);
1263 }else if( pChild->zBaseline==0 && pParent->zBaseline!=0 ){
1264 content_deltify(pParent->pBaseline->rid, cid, 0);
1265
--- src/rebuild.c
+++ src/rebuild.c
@@ -339,10 +339,67 @@
339339
if(!g.fQuiet && ttyOutput ){
340340
printf("\n");
341341
}
342342
return errCnt;
343343
}
344
+
345
+/*
346
+** Attempt to convert more full-text blobs into delta-blobs for
347
+** storage efficiency.
348
+*/
349
+static void extra_deltification(void){
350
+ Stmt q;
351
+ int topid, previd, rid;
352
+ int prevfnid, fnid;
353
+ db_begin_transaction();
354
+ db_prepare(&q,
355
+ "SELECT rid FROM event, blob"
356
+ " WHERE blob.rid=event.objid"
357
+ " AND event.type='ci'"
358
+ " AND NOT EXISTS(SELECT 1 FROM delta WHERE rid=blob.rid)"
359
+ " ORDER BY event.mtime DESC"
360
+ );
361
+ topid = previd = 0;
362
+ while( db_step(&q)==SQLITE_ROW ){
363
+ rid = db_column_int(&q, 0);
364
+ if( topid==0 ){
365
+ topid = previd = rid;
366
+ }else{
367
+ if( content_deltify(rid, previd, 0)==0 && previd!=topid ){
368
+ content_deltify(rid, topid, 0);
369
+ }
370
+ previd = rid;
371
+ }
372
+ }
373
+ db_finalize(&q);
374
+
375
+ db_prepare(&q,
376
+ "SELECT blob.rid, mlink.fnid FROM blob, mlink, plink"
377
+ " WHERE NOT EXISTS(SELECT 1 FROM delta WHERE rid=blob.rid)"
378
+ " AND mlink.fid=blob.rid"
379
+ " AND mlink.mid=plink.cid"
380
+ " AND plink.cid=mlink.mid"
381
+ " ORDER BY mlink.fnid, plink.mtime DESC"
382
+ );
383
+ prevfnid = 0;
384
+ while( db_step(&q)==SQLITE_ROW ){
385
+ rid = db_column_int(&q, 0);
386
+ fnid = db_column_int(&q, 1);
387
+ if( prevfnid!=fnid ){
388
+ prevfnid = fnid;
389
+ topid = previd = rid;
390
+ }else{
391
+ if( content_deltify(rid, previd, 0)==0 && previd!=topid ){
392
+ content_deltify(rid, topid, 0);
393
+ }
394
+ previd = rid;
395
+ }
396
+ }
397
+ db_finalize(&q);
398
+
399
+ db_end_transaction(0);
400
+}
344401
345402
/*
346403
** COMMAND: rebuild
347404
**
348405
** Usage: %fossil rebuild ?REPOSITORY?
@@ -357,10 +414,11 @@
357414
** --force Force the rebuild to complete even if errors are seen
358415
** --randomize Scan artifacts in a random order
359416
** --cluster Compute clusters for unclustered artifacts
360417
** --pagesize N Set the database pagesize to N. (512..65536 and power of 2)
361418
** --wal Set Write-Ahead-Log journalling mode on the database
419
+** --compress Strive to make the database as small as possible
362420
** --vacuum Run VACUUM on the database after rebuilding
363421
*/
364422
void rebuild_database(void){
365423
int forceFlag;
366424
int randomizeFlag;
@@ -369,16 +427,18 @@
369427
int doClustering;
370428
const char *zPagesize;
371429
int newPagesize = 0;
372430
int activateWal;
373431
int runVacuum;
432
+ int runCompress;
374433
375434
omitVerify = find_option("noverify",0,0)!=0;
376435
forceFlag = find_option("force","f",0)!=0;
377436
randomizeFlag = find_option("randomize", 0, 0)!=0;
378437
doClustering = find_option("cluster", 0, 0)!=0;
379438
runVacuum = find_option("vacuum",0,0)!=0;
439
+ runCompress = find_option("compress",0,0)!=0;
380440
zPagesize = find_option("pagesize",0,1);
381441
if( zPagesize ){
382442
newPagesize = atoi(zPagesize);
383443
if( newPagesize<512 || newPagesize>65536
384444
|| (newPagesize&(newPagesize-1))!=0
@@ -408,12 +468,18 @@
408468
if( errCnt && !forceFlag ){
409469
printf("%d errors. Rolling back changes. Use --force to force a commit.\n",
410470
errCnt);
411471
db_end_transaction(1);
412472
}else{
473
+ if( runCompress ){
474
+ printf("Extra delta compression... "); fflush(stdout);
475
+ extra_deltification();
476
+ runVacuum = 1;
477
+ }
413478
if( omitVerify ) verify_cancel();
414479
db_end_transaction(0);
480
+ if( runCompress ) printf("done\n");
415481
db_close(0);
416482
db_open_repository(g.zRepositoryName);
417483
if( newPagesize ){
418484
db_multi_exec("PRAGMA page_size=%d", newPagesize);
419485
runVacuum = 1;
420486
--- src/rebuild.c
+++ src/rebuild.c
@@ -339,10 +339,67 @@
339 if(!g.fQuiet && ttyOutput ){
340 printf("\n");
341 }
342 return errCnt;
343 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
345 /*
346 ** COMMAND: rebuild
347 **
348 ** Usage: %fossil rebuild ?REPOSITORY?
@@ -357,10 +414,11 @@
357 ** --force Force the rebuild to complete even if errors are seen
358 ** --randomize Scan artifacts in a random order
359 ** --cluster Compute clusters for unclustered artifacts
360 ** --pagesize N Set the database pagesize to N. (512..65536 and power of 2)
361 ** --wal Set Write-Ahead-Log journalling mode on the database
 
362 ** --vacuum Run VACUUM on the database after rebuilding
363 */
364 void rebuild_database(void){
365 int forceFlag;
366 int randomizeFlag;
@@ -369,16 +427,18 @@
369 int doClustering;
370 const char *zPagesize;
371 int newPagesize = 0;
372 int activateWal;
373 int runVacuum;
 
374
375 omitVerify = find_option("noverify",0,0)!=0;
376 forceFlag = find_option("force","f",0)!=0;
377 randomizeFlag = find_option("randomize", 0, 0)!=0;
378 doClustering = find_option("cluster", 0, 0)!=0;
379 runVacuum = find_option("vacuum",0,0)!=0;
 
380 zPagesize = find_option("pagesize",0,1);
381 if( zPagesize ){
382 newPagesize = atoi(zPagesize);
383 if( newPagesize<512 || newPagesize>65536
384 || (newPagesize&(newPagesize-1))!=0
@@ -408,12 +468,18 @@
408 if( errCnt && !forceFlag ){
409 printf("%d errors. Rolling back changes. Use --force to force a commit.\n",
410 errCnt);
411 db_end_transaction(1);
412 }else{
 
 
 
 
 
413 if( omitVerify ) verify_cancel();
414 db_end_transaction(0);
 
415 db_close(0);
416 db_open_repository(g.zRepositoryName);
417 if( newPagesize ){
418 db_multi_exec("PRAGMA page_size=%d", newPagesize);
419 runVacuum = 1;
420
--- src/rebuild.c
+++ src/rebuild.c
@@ -339,10 +339,67 @@
339 if(!g.fQuiet && ttyOutput ){
340 printf("\n");
341 }
342 return errCnt;
343 }
344
345 /*
346 ** Attempt to convert more full-text blobs into delta-blobs for
347 ** storage efficiency.
348 */
349 static void extra_deltification(void){
350 Stmt q;
351 int topid, previd, rid;
352 int prevfnid, fnid;
353 db_begin_transaction();
354 db_prepare(&q,
355 "SELECT rid FROM event, blob"
356 " WHERE blob.rid=event.objid"
357 " AND event.type='ci'"
358 " AND NOT EXISTS(SELECT 1 FROM delta WHERE rid=blob.rid)"
359 " ORDER BY event.mtime DESC"
360 );
361 topid = previd = 0;
362 while( db_step(&q)==SQLITE_ROW ){
363 rid = db_column_int(&q, 0);
364 if( topid==0 ){
365 topid = previd = rid;
366 }else{
367 if( content_deltify(rid, previd, 0)==0 && previd!=topid ){
368 content_deltify(rid, topid, 0);
369 }
370 previd = rid;
371 }
372 }
373 db_finalize(&q);
374
375 db_prepare(&q,
376 "SELECT blob.rid, mlink.fnid FROM blob, mlink, plink"
377 " WHERE NOT EXISTS(SELECT 1 FROM delta WHERE rid=blob.rid)"
378 " AND mlink.fid=blob.rid"
379 " AND mlink.mid=plink.cid"
380 " AND plink.cid=mlink.mid"
381 " ORDER BY mlink.fnid, plink.mtime DESC"
382 );
383 prevfnid = 0;
384 while( db_step(&q)==SQLITE_ROW ){
385 rid = db_column_int(&q, 0);
386 fnid = db_column_int(&q, 1);
387 if( prevfnid!=fnid ){
388 prevfnid = fnid;
389 topid = previd = rid;
390 }else{
391 if( content_deltify(rid, previd, 0)==0 && previd!=topid ){
392 content_deltify(rid, topid, 0);
393 }
394 previd = rid;
395 }
396 }
397 db_finalize(&q);
398
399 db_end_transaction(0);
400 }
401
402 /*
403 ** COMMAND: rebuild
404 **
405 ** Usage: %fossil rebuild ?REPOSITORY?
@@ -357,10 +414,11 @@
414 ** --force Force the rebuild to complete even if errors are seen
415 ** --randomize Scan artifacts in a random order
416 ** --cluster Compute clusters for unclustered artifacts
417 ** --pagesize N Set the database pagesize to N. (512..65536 and power of 2)
418 ** --wal Set Write-Ahead-Log journalling mode on the database
419 ** --compress Strive to make the database as small as possible
420 ** --vacuum Run VACUUM on the database after rebuilding
421 */
422 void rebuild_database(void){
423 int forceFlag;
424 int randomizeFlag;
@@ -369,16 +427,18 @@
427 int doClustering;
428 const char *zPagesize;
429 int newPagesize = 0;
430 int activateWal;
431 int runVacuum;
432 int runCompress;
433
434 omitVerify = find_option("noverify",0,0)!=0;
435 forceFlag = find_option("force","f",0)!=0;
436 randomizeFlag = find_option("randomize", 0, 0)!=0;
437 doClustering = find_option("cluster", 0, 0)!=0;
438 runVacuum = find_option("vacuum",0,0)!=0;
439 runCompress = find_option("compress",0,0)!=0;
440 zPagesize = find_option("pagesize",0,1);
441 if( zPagesize ){
442 newPagesize = atoi(zPagesize);
443 if( newPagesize<512 || newPagesize>65536
444 || (newPagesize&(newPagesize-1))!=0
@@ -408,12 +468,18 @@
468 if( errCnt && !forceFlag ){
469 printf("%d errors. Rolling back changes. Use --force to force a commit.\n",
470 errCnt);
471 db_end_transaction(1);
472 }else{
473 if( runCompress ){
474 printf("Extra delta compression... "); fflush(stdout);
475 extra_deltification();
476 runVacuum = 1;
477 }
478 if( omitVerify ) verify_cancel();
479 db_end_transaction(0);
480 if( runCompress ) printf("done\n");
481 db_close(0);
482 db_open_repository(g.zRepositoryName);
483 if( newPagesize ){
484 db_multi_exec("PRAGMA page_size=%d", newPagesize);
485 runVacuum = 1;
486

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button