#pragma ident "%Z%%M% %I% %E% SMI"

# Run this TCL script using "testfixture" in order to get a report that shows
# how much disk space is used by a particular database to actually store data
# versus how much space is unused.
#

# Get the name of the database to analyze
#
if {[llength $argv]!=1} {
  puts stderr "Usage: $argv0 database-name"
  exit 1
}
set file_to_analyze [lindex $argv 0]
if {![file exists $file_to_analyze]} {
  puts stderr "No such file: $file_to_analyze"
  exit 1
}
if {![file readable $file_to_analyze]} {
  puts stderr "File is not readable: $file_to_analyze"
  exit 1
}
if {[file size $file_to_analyze]<2048} {
  puts stderr "Empty or malformed database: $file_to_analyze"
  exit 1
}

# Open the database.  "db" is the SQL-level handle used to read the schema;
# $DB is the raw btree handle used to walk the pages of each table and index.
#
sqlite db $file_to_analyze
set DB [btree_open $file_to_analyze]

# In-memory database for collecting statistics
#
sqlite mem :memory:
set tabledef\
{CREATE TABLE space_used(
   name clob,        -- Name of a table or index in the database file
   tblname clob,     -- Name of associated table
   is_index boolean, -- TRUE if it is an index, false for a table
   nentry int,       -- Number of entries in the BTree
   payload int,      -- Total amount of data stored in this table or index
   mx_payload int,   -- Maximum payload size
   n_ovfl int,       -- Number of entries that overflow
   pri_pages int,    -- Number of primary pages used
   ovfl_pages int,   -- Number of overflow pages used
   pri_unused int,   -- Number of unused bytes on primary pages
   ovfl_unused int   -- Number of unused bytes on overflow pages
);}
mem eval $tabledef

# This query will be used to find the root page number for every index and
# table in the database.  The sqlite_master table itself is not listed in
# sqlite_master, so it is appended explicitly with root page 2.
#
set sql {
  SELECT name, tbl_name, type, rootpage
    FROM sqlite_master WHERE type IN ('table','index')
  UNION ALL
  SELECT 'sqlite_master', 'sqlite_master', 'table', 2
  ORDER BY 1
}

# Analyze every table in the database, one at a time.  For each btree the
# loop visits every entry, accumulates payload and overflow statistics, and
# finally stores one row in the in-memory space_used table.
#
foreach {name tblname type rootpage} [db eval $sql] {
  puts stderr "Analyzing $name..."
  set cursor [btree_cursor $DB $rootpage 0]
  set go [btree_first $cursor]
  set size 0
  # pg_used maps page number -> free bytes on that page; start from a clean
  # array for every btree.
  unset -nocomplain pg_used
  set unused_ovfl 0
  set n_overflow 0
  set cnt_ovfl 0
  set n_entry 0
  set mx_size 0
  # Seed the root page so that an empty btree still counts one primary page.
  # NOTE(review): 1016 is presumably the free space of an empty page - confirm.
  set pg_used($rootpage) 1016
  while {$go==0} {
    incr n_entry
    set payload [btree_payload_size $cursor]
    incr size $payload
    # Fields 0 and 4 of the cursor dump are used as the page number and the
    # free-byte count of the page the cursor currently points at.
    set stat [btree_cursor_dump $cursor]
    set pgno [lindex $stat 0]
    set freebytes [lindex $stat 4]
    set pg_used($pgno) $freebytes
    if {$payload>236} {
      # if {[lindex $stat 8]==0} {error "overflow is empty with $payload"}
      # Payload beyond the first 236 bytes spills to overflow pages; the
      # arithmetic rounds the excess up to whole pages of 1020 bytes each.
      set n [expr {($payload-236+1019)/1020}]
      incr n_overflow $n
      incr cnt_ovfl
      incr unused_ovfl [expr {$n*1020+236-$payload}]
    } else {
      # if {[lindex $stat 8]!=0} {error "overflow not empty with $payload"}
    }
    if {$payload>$mx_size} {set mx_size $payload}
    set go [btree_next $cursor]
  }
  btree_close_cursor $cursor
  set n_primary [llength [array names pg_used]]
  set unused_primary 0
  foreach x [array names pg_used] {incr unused_primary $pg_used($x)}
  # Double every apostrophe so the names can be embedded in SQL literals.
  regsub -all ' $name '' name
  set sql "INSERT INTO space_used VALUES('$name'"
  regsub -all ' $tblname '' tblname
  append sql ",'$tblname',[expr {$type=="index"}],$n_entry"
  append sql ",$size,$mx_size,$cnt_ovfl,"
  append sql "$n_primary,$n_overflow,$unused_primary,$unused_ovfl);"
  mem eval $sql
}

# Generate a single line of output in the statistics section of the
# report.
#
#   title  - label printed at the left, followed by dot leaders that fill
#            it out to a 38-character column
#   value  - the statistic itself
#   extra  - optional annotation (normally a percentage) printed to the
#            right of the value
#
# The value is padded on the right to a fixed-width field so that the
# annotations of consecutive lines start in the same column.  Values wider
# than the field are printed unpadded.
#
# NOTE(review): in this copy of the script the padding literal had been
# collapsed to a single space, which made the padding a no-op for every
# non-empty value.  A field width of 10 is used here - confirm against the
# upstream script.
#
proc statline {title value {extra {}}} {
  set len [string length $title]
  set dots [string range {......................................} $len end]
  set len [string length $value]
  set sp2 [string range [string repeat " " 10] $len end]
  if {$extra ne ""} {
    set extra " $extra"
  }
  puts "$title$dots $value$sp2$extra"
}

# Generate a formatted percentage value for $num/$denom
#
# Returns an empty string when $denom is zero.  Otherwise the precision
# depends on the magnitude: one decimal strictly between 1% and 99%, three
# decimals below 0.1% or above 99.9%, and two decimals for everything else.
#
proc percent {num denom} {
  if {$denom==0.0} {return ""}
  set v [expr {$num*100.0/$denom}]
  if {$v>1.0 && $v<99.0} {
    return [format %4.1f%% $v]
  } elseif {$v<0.1 || $v>99.9} {
    return [format %6.3f%% $v]
  } else {
    return [format %5.2f%% $v]
  }
}

# Generate a subreport that covers some subset of the database.
# The $where clause determines which subset to analyze.
#
#   title  - heading printed in the banner of the subreport
#   where  - SQL boolean expression over the columns of space_used
#
# Returns 0 if the query produced no row (nothing is printed in that case)
# and 1 after the subreport has been written to standard output.
#
proc subreport {title where} {
  # One aggregate per report column.  The query runs with an empty array
  # name, so each column alias ends up in a local variable of that name.
  set columns [join {
    {sum(nentry) AS nentry}
    {sum(payload) AS payload}
    {sum(CASE is_index WHEN 1 THEN 0 ELSE payload-4*nentry END) AS data}
    {max(mx_payload) AS mx_payload}
    {sum(n_ovfl) as n_ovfl}
    {sum(pri_pages) AS pri_pages}
    {sum(ovfl_pages) AS ovfl_pages}
    {sum(pri_unused) AS pri_unused}
    {sum(ovfl_unused) AS ovfl_unused}
  } ", "]
  set hit 0
  mem eval "SELECT $columns FROM space_used WHERE $where" {} {set hit 1}
  if {$hit == 0} {
    return 0
  }

  # Banner: the title followed by enough stars to fill out the line.
  set filler [string range [string repeat * 70] \
                  [expr {[string length $title] + 5}] end]
  puts ""
  puts "*** $title $filler"
  puts ""

  # Derived quantities.  Pages are 1024 bytes each.
  set allpgs [expr {$pri_pages+$ovfl_pages}]
  set storage [expr {$allpgs*1024}]
  set key [expr {$payload-$data}]
  set allunused [expr {$ovfl_unused+$pri_unused}]
  if {$nentry>0} {
    set avgpay [expr {$payload/$nentry}]
    set avgunused [expr {($pri_unused+$ovfl_unused)/$nentry}]
  } else {
    set avgpay 0
    set avgunused 0
  }
  if {$pri_pages==0} {
    set fanout 0
  } else {
    set fanout [expr {($nentry+0.0)/$pri_pages}]
  }

  statline "Percentage of total database" [percent $allpgs $::file_pgcnt]
  statline "Number of entries" $nentry
  statline "Bytes of storage consumed" $storage
  statline "Bytes of payload" $payload [percent $payload $storage]
  statline "Bytes of data" $data [percent $data $storage]
  statline "Bytes of key" $key [percent $key $storage]
  statline "Average payload per entry" $avgpay
  statline "Average unused bytes per entry" $avgunused
  statline "Average fanout" [format %.2f $fanout]
  statline "Maximum payload per entry" $mx_payload
  statline "Entries that use overflow" $n_ovfl [percent $n_ovfl $nentry]
  statline "Total pages used" $allpgs
  statline "Primary pages used" $pri_pages ;# [percent $pri_pages $allpgs]
  statline "Overflow pages used" $ovfl_pages ;# [percent $ovfl_pages $allpgs]
  statline "Unused bytes on primary pages" $pri_unused \
      [percent $pri_unused [expr {$pri_pages*1024}]]
  statline "Unused bytes on overflow pages" $ovfl_unused \
      [percent $ovfl_unused [expr {$ovfl_pages*1024}]]
  statline "Unused bytes on all pages" $allunused \
      [percent $allunused $storage]
  return 1
}

# Output summary statistics:
#
puts "/** Disk-Space Utilization Report For $file_to_analyze"
puts "*** As of [clock format [clock seconds] -format {%Y-%b-%d %H:%M:%S}]"
puts ""

# Page accounting.  The "measured" page count comes from the file size; the
# "calculated" one adds up used pages, the freelist count taken from the
# first btree meta value, and one header page.
set fsize [file size [lindex $argv 0]]
set file_pgcnt [expr {$fsize/1024}]
set usedcnt [mem eval {SELECT sum(pri_pages+ovfl_pages) FROM space_used}]
set freecnt [expr {$file_pgcnt-$usedcnt-1}]
set freecnt2 [lindex [btree_get_meta $DB] 0]
set file_pgcnt2 [expr {$usedcnt+$freecnt2+1}]
statline "Pages in the whole file (measured)" $file_pgcnt
statline "Pages in the whole file (calculated)" $file_pgcnt2
statline "Pages that store data" $usedcnt [percent $usedcnt $file_pgcnt]
statline "Pages on the freelist (per header)" \
    $freecnt2 [percent $freecnt2 $file_pgcnt]
statline "Pages on the freelist (calculated)" \
    $freecnt [percent $freecnt $file_pgcnt]
statline "Header pages" 1 [percent 1 $file_pgcnt]

# Schema object counts.  The table count includes sqlite_master, hence +1.
set ntable [db eval {
  SELECT count(*)+1 FROM sqlite_master WHERE type='table'
}]
set nindex [db eval {
  SELECT count(*) FROM sqlite_master WHERE type='index'
}]
set autoindex [db eval {
  SELECT count(*) FROM sqlite_master
   WHERE type='index' AND name LIKE '(% autoindex %)'
}]
set manindex [expr {$nindex-$autoindex}]
statline "Number of tables in the database" $ntable
statline "Number of indices" $nindex
statline "Number of named indices" $manindex [percent $manindex $nindex]
statline "Automatically generated indices" $autoindex \
    [percent $autoindex $nindex]

# Byte totals.  User data excludes the 4-byte key of every table entry and
# everything stored in sqlite_master.
set bytes_data [mem eval {
  SELECT sum(payload-4*nentry) FROM space_used
   WHERE NOT is_index AND name!='sqlite_master'
}]
set total_payload [mem eval {SELECT sum(payload) FROM space_used}]
statline "Size of the file in bytes" $fsize
statline "Bytes of payload stored" $total_payload \
    [percent $total_payload $fsize]
statline "Bytes of user data stored" $bytes_data \
    [percent $bytes_data $fsize]

# Output table rankings
#
puts ""
puts "*** Page counts for all tables with their indices ********************"
puts ""
# One line per table, largest first; a table's indices are counted with it.
mem eval {SELECT tblname, count(*) AS cnt, sum(pri_pages+ovfl_pages) AS size
          FROM space_used GROUP BY tblname ORDER BY size DESC, tblname} {} {
  statline [string toupper $tblname] $size [percent $size $file_pgcnt]
}

# Output subreports
#
if {$nindex>0} {
  subreport {All tables and indices} 1
}
subreport {All tables} {NOT is_index}
if {$nindex>0} {
  subreport {All indices} {is_index}
}
foreach tbl [mem eval {SELECT name FROM space_used WHERE NOT is_index
                       ORDER BY name}] {
  # Double EVERY apostrophe in the name before embedding it in an SQL
  # literal; without -all only the first one would be escaped and a name
  # containing two or more apostrophes would produce malformed SQL.
  regsub -all ' $tbl '' qn
  set name [string toupper $tbl]
  # A count above 1 means the table has at least one index of its own.
  set n [mem eval "SELECT count(*) FROM space_used WHERE tblname='$qn'"]
  if {$n>1} {
    subreport "Table $name and all its indices" "tblname='$qn'"
    subreport "Table $name w/o any indices" "name='$qn'"
    subreport "Indices of table $name" "tblname='$qn' AND is_index"
  } else {
    subreport "Table $name" "name='$qn'"
  }
}

# Output instructions on what the numbers above mean.
#
puts {
*** Definitions ******************************************************

Number of pages in the whole file

    The number of 1024-byte pages that go into forming the complete database

Pages that store data

    The number of pages that store data, either as primary B*Tree pages or
    as overflow pages. The number at the right is the data pages divided by
    the total number of pages in the file.

Pages on the freelist

    The number of pages that are not currently in use but are reserved for
    future use. The percentage at the right is the number of freelist pages
    divided by the total number of pages in the file.

Header pages

    The number of pages of header overhead in the database. This value is
    always 1. The percentage at the right is the number of header pages
    divided by the total number of pages in the file.

Number of tables in the database

    The number of tables in the database, including the SQLITE_MASTER table
    used to store schema information.

Number of indices

    The total number of indices in the database.

Number of named indices

    The number of indices created using an explicit CREATE INDEX statement.

Automatically generated indices

    The number of indices used to implement PRIMARY KEY or UNIQUE constraints
    on tables.

Size of the file in bytes

    The total amount of disk space used by the entire database files.

Bytes of payload stored

    The total number of bytes of payload stored in the database. Payload
    includes both key and data. The content of the SQLITE_MASTER table is
    counted when computing this number. The percentage at the right shows
    the payload divided by the total file size.

Bytes of user data stored

    The total number of bytes of data stored in the database, not counting
    the database schema information stored in the SQLITE_MASTER table. The
    percentage at the right is the user data size divided by the total file
    size.

Percentage of total database

    The amount of the complete database file that is devoted to storing
    information described by this category.

Number of entries

    The total number of B*Tree key/value pairs stored under this category.

Bytes of storage consumed

    The total amount of disk space required to store all B*Tree entries
    under this category. The is the total number of pages used times
    the pages size (1024).

Bytes of payload

    The amount of payload stored under this category. Payload is the sum
    of keys and data. Each table entry has 4 bytes of key and an arbitrary
    amount of data. Each index entry has 4 or more bytes of key and no
    data. The percentage at the right is the bytes of payload divided by
    the bytes of storage consumed.

Bytes of data

    The amount of data stored under this category. The data space reported
    includes formatting information such as nul-terminators and field-lengths
    that are stored with the data. The percentage at the right is the bytes
    of data divided by bytes of storage consumed.

Bytes of key

    The sum of the sizes of all keys under this category. The percentage at
    the right is the bytes of key divided by the bytes of storage consumed.

Average payload per entry

    The average amount of payload on each entry. This is just the bytes of
    payload divided by the number of entries.

Average unused bytes per entry

    The average amount of free space remaining on all pages under this
    category on a per-entry basis. This is the number of unused bytes on
    all pages divided by the number of entries.

Maximum payload per entry

    The largest payload size of any entry.

Entries that use overflow

    Up to 236 bytes of payload for each entry are stored directly in the
    primary B*Tree page. Any additional payload is stored on a linked list
    of overflow pages. This is the number of entries that exceed 236 bytes
    in size. The value to the right is the number of entries that overflow
    divided by the total number of entries.

Total pages used

    This is the number of 1024 byte pages used to hold all information in
    the current category. This is the sum of primary and overflow pages.

Primary pages used

    This is the number of primary B*Tree pages used.

Overflow pages used

    The total number of overflow pages used for this category.

Unused bytes on primary pages

    The total number of bytes of unused space on all primary pages. The
    percentage at the right is the number of unused bytes divided by the
    total number of bytes on primary pages.

Unused bytes on overflow pages

    The total number of bytes of unused space on all overflow pages. The
    percentage at the right is the number of unused bytes divided by the
    total number of bytes on overflow pages.

Unused bytes on all pages

    The total number of bytes of unused space on all primary and overflow
    pages. The percentage at the right is the number of unused bytes
    divided by the total number of bytes.
}

# Output the database
#
puts "**********************************************************************"
puts "The entire text of this report can be sourced into any SQL database"
puts "engine for further analysis. All of the text above is an SQL comment."
puts "The data used to generate this report follows:"
puts "*/"
puts "BEGIN;"
puts $tabledef
# Emit one INSERT statement per collected row so the report can be replayed
# into an SQL engine.  The row is delivered in array x, which is cleared
# first so that no stale scalar or array of that name interferes.
unset -nocomplain x
mem eval {SELECT * FROM space_used} x {
  # Double EVERY apostrophe in the two text columns; escaping only the first
  # one would emit an invalid INSERT for names containing several of them.
  regsub -all ' $x(name) '' qn
  regsub -all ' $x(tblname) '' qtn
  puts -nonewline "INSERT INTO space_used VALUES("
  puts -nonewline "'$qn','$qtn',"
  puts -nonewline "$x(is_index),$x(nentry),$x(payload),$x(mx_payload),"
  puts -nonewline "$x(n_ovfl),$x(pri_pages),$x(ovfl_pages),$x(pri_unused),"
  puts "$x(ovfl_unused));"
}
puts "COMMIT;"