воскресенье, 1 ноября 2009 г.

The Zlib Compression Patch for the FTS3 Extension

I did try to add the zlib compression to the FTS3 extension for document content and metadata. The result you can see below. The code is only prototype but it's simple.

The functions compress() and uncompress() are published by DRH here: Re: [sqlite] Compress function. I did wrap these to the compression extension.

The original FTS3:
$ sqlite3
SQLite version 3.6.19
Enter ".help" for instructions
Enter SQL statements terminated with a ";"
sqlite> CREATE VIRTUAL TABLE t USING fts3(content, TOKENIZE icu ru_RU);
sqlite> insert into t (content) values('
...> - работай, как будто тебе не надо денег,
...> - люби, как будто тебе никто никогда не причинял боль,
...> - танцуй, как будто никто не смотрит,
...> - пой, как будто никто не слышит,
...> - живи, как будто на земле рай');
sqlite>
sqlite> insert into t (content) values('
...> Среди миров
...>
...> Среди миров, в мерцании светил
...> Одной Звезды я повторяю имя...
...> Не потому, чтоб я Её любил,
...> А потому, что я томлюсь с другими.
...>
...> И если мне сомненье тяжело,
...> Я у Неё одной молю ответа,
...> Не потому, что от Неё светло,
...> А потому, что с Ней не надо света.
...>
...> Иннокентий Анненский
...> ');
sqlite>
sqlite> .schema
CREATE VIRTUAL TABLE t USING fts3(content, TOKENIZE icu ru_RU);
CREATE TABLE t_content(  docid INTEGER PRIMARY KEY,c0content);
CREATE TABLE t_segdir(  level integer,  idx integer,  start_block integer,  leaves_end_block integer,  end_block integer,  root blob,  primary key(level, idx));
CREATE TABLE t_segments(  blockid INTEGER PRIMARY KEY,  block blob);
sqlite> select length(root) from t_segdir;
308
493
sqlite> select length(c0content) from t_content;
199
282
sqlite> select snippet(t, '[', ']', '%%') from t where t match 'НА*';
%% как будто тебе не [надо] денег,
- люби, как будто %%
%% потому, что с Ней не [надо] света.

Иннокентий %%


The patched FTS3 with zlib-compression:
$ sqlite3                                                                                                                             
SQLite version 3.6.19                                                                                                                                        
Enter ".help" for instructions                                                                                                                               
Enter SQL statements terminated with a ";"                                                                                                                   
sqlite> CREATE VIRTUAL TABLE t USING fts3(content, TOKENIZE icu ru_RU);                                                                                      
sqlite> insert into t (content) values('
...> - работай, как будто тебе не надо денег,
...> - люби, как будто тебе никто никогда не причинял боль,
...> - танцуй, как будто никто не смотрит,
...> - пой, как будто никто не слышит,
...> - живи, как будто на земле рай');
sqlite>
sqlite> insert into t (content) values('
...> Среди миров
...>
...> Среди миров, в мерцании светил
...> Одной Звезды я повторяю имя...
...> Не потому, чтоб я Её любил,
...> А потому, что я томлюсь с другими.
...>
...> И если мне сомненье тяжело,
...> Я у Неё одной молю ответа,
...> Не потому, что от Неё светло,
...> А потому, что с Ней не надо света.
...>
...> Иннокентий Анненский
...> ');
sqlite>
sqlite> .schema
CREATE VIRTUAL TABLE t USING fts3(content, TOKENIZE icu ru_RU);
CREATE TABLE t_content(  docid INTEGER PRIMARY KEY,c0content blob);
CREATE TABLE t_segdir(  level integer,  idx integer,  start_block integer,  leaves_end_block integer,  end_block integer,  root blob,  primary key(level, idx));
CREATE TABLE t_segments(  blockid INTEGER PRIMARY KEY,  block blob);
CREATE TRIGGER t_content_i after insert on main.t_content begin update t_content set c0content=compress(c0content) where docid=NEW.docid; end;
sqlite> select length(root) from t_segdir;
234
353
sqlite> select length(c0content) from t_content;
165
250
sqlite> select snippet(t, '[', ']', '%%') from t where t match 'НА*';
%% как будто тебе не [надо] денег,
- люби, как будто %%
%% потому, что с Ней не [надо] света.

Иннокентий %%


The patch for the FTS3 extension with compression of the document and the metadata:

--- sqlite3-3.6.19.orig/ext/fts3/fts3.c
+++ sqlite3-3.6.19/ext/fts3/fts3.c
@@ -1856,9 +1856,9 @@
/* BLOCK_DELETE_ALL */ "delete from %_segments",

/* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
-  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
+  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, compress(?))",
/* SEGDIR_SELECT_LEVEL */
-  "select start_block, leaves_end_block, root from %_segdir "
+  "select start_block, leaves_end_block, uncompress(root) from %_segdir "
" where level = ? order by idx",
/* SEGDIR_SPAN */
"select min(start_block), max(end_block) from %_segdir "
@@ -1869,10 +1869,10 @@
** statements must match.
*/
/* SEGDIR_SELECT_SEGMENT */
-  "select start_block, leaves_end_block, root from %_segdir "
+  "select start_block, leaves_end_block, uncompress(root) from %_segdir "
" where level = ? and idx = ?",
/* SEGDIR_SELECT_ALL */
-  "select start_block, leaves_end_block, root from %_segdir "
+  "select start_block, leaves_end_block, uncompress(root) from %_segdir "
" order by level desc, idx asc",
/* SEGDIR_DELETE_ALL */ "delete from %_segdir",
/* SEGDIR_COUNT */ "select count(*), ifnull(max(level),0) from %_segdir",
@@ -1958,8 +1958,10 @@
append(&sb, "insert into %_content (docid, ");
appendList(&sb, v->nColumn, v->azContentColumn);
append(&sb, ") values (?");
+//  append(&sb, ") values (compress(?)");
for(i=0; inColumn; ++i)
append(&sb, ", ?");
+//    append(&sb, ", compress(?)");
append(&sb, ")");
return stringBufferData(&sb);
}
@@ -1969,9 +1971,19 @@
*/
static const char *contentSelectStatement(fulltext_vtab *v){
StringBuffer sb;
+  int i;
+
initStringBuffer(&sb);
append(&sb, "SELECT ");
-  appendList(&sb, v->nColumn, v->azContentColumn);
+//  appendList(&sb, v->nColumn, v->azContentColumn);
+  for(i=0; inColumn; ++i) {
+    if( i>0 ){
+      append(&sb, ", ");
+    }
+    append(&sb, "uncompress(");
+    append(&sb, v->azContentColumn[i]);
+    append(&sb, ")");
+  }
append(&sb, " FROM %_content WHERE docid = ?");
return stringBufferData(&sb);
}
@@ -1991,7 +2003,8 @@
append(&sb, ", ");
}
append(&sb, v->azContentColumn[i]);
-    append(&sb, " = ?");
+//    append(&sb, " = ?");
+    append(&sb, " = compress(?)");
}
append(&sb, " where docid = ?");
return stringBufferData(&sb);
@@ -2975,6 +2988,7 @@
int rc;
TableSpec spec;
StringBuffer schema;
+  int i;
FTSTRACE(("FTS3 Create\n"));

rc = parseSpec(&spec, argc, argv, pzErr);
@@ -2983,8 +2997,27 @@
initStringBuffer(&schema);
append(&schema, "CREATE TABLE %_content(");
append(&schema, "  docid INTEGER PRIMARY KEY,");
-  appendList(&schema, spec.nColumn, spec.azContentColumn);
-  append(&schema, ")");
+//  appendList(&schema, spec.nColumn, spec.azContentColumn);
+  for(i=0; i0 ){
+      append(&schema, ", ");
+    }
+    append(&schema, spec.azContentColumn[i]);
+    append(&schema, " blob");
+  }
+  append(&schema, ");");
+  // dummy compression code
+  append(&schema, "create trigger %_content_i after insert on %_content begin update t_content set ");
+  for(i=0; i0 ){
+      append(&schema, ", ");
+    }
+    append(&schema, spec.azContentColumn[i]);
+    append(&schema, "=compress(");
+    append(&schema, spec.azContentColumn[i]);
+    append(&schema, ")");
+  }
+  append(&schema, " where docid=NEW.docid; end;");
rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
stringBufferDestroy(&schema);
if( rc!=SQLITE_OK ) goto out;
@@ -3900,6 +3933,7 @@
fulltext_cursor *c = (fulltext_cursor *) pCursor;
fulltext_vtab *v = cursor_vtab(c);
int rc;
+  int i;

FTSTRACE(("FTS3 Filter %p\n",pCursor));

@@ -3924,7 +3958,15 @@
StringBuffer sb;
initStringBuffer(&sb);
append(&sb, "SELECT docid, ");
-    appendList(&sb, v->nColumn, v->azContentColumn);
+//    appendList(&sb, v->nColumn, v->azContentColumn);
+    for(i=0; inColumn; ++i) {
+      if( i>0 ){
+        append(&sb, ", ");
+      }
+      append(&sb, "uncompress(");
+      append(&sb, v->azContentColumn[i]);
+      append(&sb, ")");
+    }
append(&sb, " FROM %_content");
if( idxNum!=QUERY_GENERIC ) append(&sb, " WHERE docid = ?");
rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt,


Upd.
Tests of the Zlib-patched FTS3

The patch for the FTS3 extension with compression of the document only:
--- fts3.c 2009-09-05 00:37:41.000000000 +0400
+++ fts3.c.compress_content 2009-11-01 19:14:39.000000000 +0300
@@ -1958,8 +1958,10 @@
append(&sb, "insert into %_content (docid, ");
appendList(&sb, v->nColumn, v->azContentColumn);
append(&sb, ") values (?");
+//  append(&sb, ") values (compress(?)");
for(i=0; inColumn; ++i)
append(&sb, ", ?");
+//    append(&sb, ", compress(?)");
append(&sb, ")");
return stringBufferData(&sb);
}
@@ -1969,9 +1971,19 @@
*/
static const char *contentSelectStatement(fulltext_vtab *v){
StringBuffer sb;
+  int i;
+
initStringBuffer(&sb);
append(&sb, "SELECT ");
-  appendList(&sb, v->nColumn, v->azContentColumn);
+//  appendList(&sb, v->nColumn, v->azContentColumn);
+  for(i=0; inColumn; ++i) {
+    if( i>0 ){
+      append(&sb, ", ");
+    }
+    append(&sb, "uncompress(");
+    append(&sb, v->azContentColumn[i]);
+    append(&sb, ")");
+  }
append(&sb, " FROM %_content WHERE docid = ?");
return stringBufferData(&sb);
}
@@ -1991,7 +2003,8 @@
append(&sb, ", ");
}
append(&sb, v->azContentColumn[i]);
-    append(&sb, " = ?");
+//    append(&sb, " = ?");
+    append(&sb, " = compress(?)");
}
append(&sb, " where docid = ?");
return stringBufferData(&sb);
@@ -2975,6 +2988,7 @@
int rc;
TableSpec spec;
StringBuffer schema;
+  int i;
FTSTRACE(("FTS3 Create\n"));

rc = parseSpec(&spec, argc, argv, pzErr);
@@ -2983,8 +2997,27 @@
initStringBuffer(&schema);
append(&schema, "CREATE TABLE %_content(");
append(&schema, "  docid INTEGER PRIMARY KEY,");
-  appendList(&schema, spec.nColumn, spec.azContentColumn);
-  append(&schema, ")");
+//  appendList(&schema, spec.nColumn, spec.azContentColumn);
+  for(i=0; i0 ){
+      append(&schema, ", ");
+    }
+    append(&schema, spec.azContentColumn[i]);
+    append(&schema, " blob");
+  }
+  append(&schema, ");");
+  // dummy compression code
+  append(&schema, "create trigger %_content_i after insert on %_content begin update t_content set ");
+  for(i=0; i0 ){
+      append(&schema, ", ");
+    }
+    append(&schema, spec.azContentColumn[i]);
+    append(&schema, "=compress(");
+    append(&schema, spec.azContentColumn[i]);
+    append(&schema, ")");
+  }
+  append(&schema, " where docid=NEW.docid; end;");
rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
stringBufferDestroy(&schema);
if( rc!=SQLITE_OK ) goto out;
@@ -3900,6 +3933,7 @@
fulltext_cursor *c = (fulltext_cursor *) pCursor;
fulltext_vtab *v = cursor_vtab(c);
int rc;
+  int i;

FTSTRACE(("FTS3 Filter %p\n",pCursor));

@@ -3924,7 +3958,15 @@
StringBuffer sb;
initStringBuffer(&sb);
append(&sb, "SELECT docid, ");
-    appendList(&sb, v->nColumn, v->azContentColumn);
+//    appendList(&sb, v->nColumn, v->azContentColumn);
+    for(i=0; inColumn; ++i) {
+      if( i>0 ){
+        append(&sb, ", ");
+      }
+      append(&sb, "uncompress(");
+      append(&sb, v->azContentColumn[i]);
+      append(&sb, ")");
+    }
append(&sb, " FROM %_content");
if( idxNum!=QUERY_GENERIC ) append(&sb, " WHERE docid = ?");
rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt,

Upd.Results for upstream SQLite (without compression):
CREATE VIRTUAL TABLE file_text USING fts3(content, meta, TOKENIZE icu en_US);
CREATE TABLE 'file_text_content'(docid INTEGER PRIMARY KEY, 'c0content', 'c1meta');
CREATE TABLE 'file_text_segdir'(level INTEGER,idx INTEGER,start_block INTEGER,
    leaves_end_block INTEGER,end_block INTEGER,root BLOB,PRIMARY KEY(level, idx));
CREATE TABLE 'file_text_segments'(blockid INTEGER PRIMARY KEY, block BLOB);
CREATE INDEX file_dirname_idx on file(dirname);
sqlite> select total(length(root))/total(length(compress(root))) from file_text_segdir;
1.25312710154674
sqlite> select total(length(c0content))/total(length(compress(c0content))) from file_text_content;
3.825197339831
sqlite> select total(length(c1meta))/total(length(compress(c1meta))) from file_text_content;
0.762119156771678
sqlite> select total(length(block))/total(length(compress(block))) from file_text_segments;
3.09731296368889

Комментариев нет:


(C) Alexey Pechnikov aka MBG, mobigroup.ru