понедельник, 2 ноября 2009 г.

A Tcl document scanner for FTS3

A useful scanner can collect both the document body and its metadata. To start, we can store the type, size and a checksum of the original file.


#!/usr/bin/tclsh8.5
# Scan the files listed on stdin and index their extracted text with SQLite FTS3.
#
# Usage:
#   find /mnt/backup/project/offline1/www/share | ./scan.tcl /mnt/backup/project/offline1/www/share
#
# stdin  : file names, one per line.
# argv 0 : optional root directory; when a file name starts with it, that
#          prefix is cut from the name stored in file.uri.
#
# A file is indexed only when it is a regular file and a filter program exists
# at ./filters/<mime>_filter for its MIME type. The filter is run with the file
# name as its only argument and its output is stored as the searchable text.
package require sqlite3

# Start from an empty database on every run.
catch {file delete scan.db}
sqlite3 db scan.db
#sqlite3 db :memory:

db eval {
    CREATE TABLE file (
        id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
        save_date REAL NOT NULL DEFAULT (julianday('now')),
        delete_date REAL,
        checksum TEXT NOT NULL,
        uri TEXT NOT NULL,
        size INT NOT NULL,
        mime TEXT NOT NULL DEFAULT ''
    );
    CREATE VIRTUAL TABLE file_text USING fts3(content, TOKENIZE icu ru_RU);
}

# cut the root directory name from file names
set root [lindex $argv 0]
set root_len [string length $root]

# "gets stdin path" returns -1 at end of input, so the loop never processes a
# bogus empty name after the last line.
while {[gets stdin path] >= 0} {
    # Skip everything that is not a regular file. "file type" raises an error
    # for names it cannot examine; those are skipped as well.
    if {[catch {file type $path} type] || $type ne {file}} continue

    if {[catch {
        set mime [exec file --brief --mime-type $path]
        set filter ./filters/${mime}_filter
        if {[file exists $filter]} {
            # Keep the first 32 characters of the md5sum output (the hex digest).
            set md5 [string range [exec md5sum $path] 0 31]
            set text [exec $filter $path]
            set size [file size $path]

            # Store the name relative to the root when it starts with the root.
            set uri $path
            if {$root ne {} && [string equal -length $root_len $root $path]} {
                set uri [string range $path $root_len end]
            }
            puts "$uri => $mime => $size"

            # Write both rows atomically so that a failed FTS insert cannot
            # leave a "file" row without its text.
            db transaction {
                db eval {INSERT INTO file (checksum, uri, size, mime) VALUES ($md5, $uri, $size, $mime)}
                db eval {INSERT INTO file_text (rowid, content) VALUES (last_insert_rowid(), $text)}
            }
        }
    } err]} {
        # Best effort: keep scanning, but report what was skipped and why.
        puts stderr "$path: $err"
    }
}
db eval {vacuum}


A simple search query with plain-text results can be written as:

-- Full-text search; matches in the snippet are wrapped in [ ] and the
-- snippet is delimited with %%.
select
    file.uri,
    file.mime,
    file.size,
    snippet(file_text, '[', ']', '%%')
from file_text
inner join file
    on file.rowid = file_text.rowid
where file_text match 'коды def';

/Uslugi1.html|text/html|35621|%% информация по кодам [DEF]

[Коды] [DEF]

Автоматические %%
/Uslugi.html|text/html|32928|%% информация по кодам [DEF]

[Коды] [DEF]

Автоматические %%



And with HTML results (with its default arguments, snippet() marks the matches with HTML tags):

-- Full-text search; snippet() is called with its default match markers.
select
    file.uri,
    file.mime,
    file.size,
    snippet(file_text)
from file_text
inner join file
    on file.rowid = file_text.rowid
where file_text match 'коды def';


/Uslugi1.html|text/html|35621|... информация по кодам DEF

Коды DEF

Автоматические ...
/Uslugi.html|text/html|32928|... информация по кодам DEF

Коды DEF

Автоматические ...

Комментариев нет:


(C) Alexey Pechnikov aka MBG, mobigroup.ru