1# Run this TCL script using "testfixture" in order get a report that shows 2# how much disk space is used by a particular data to actually store data 3# versus how much space is unused. 4# 5 6# Get the name of the database to analyze 7# 8if {[llength $argv]!=1} { 9 puts stderr "Usage: $argv0 database-name" 10 exit 1 11} 12set file_to_analyze [lindex $argv 0] 13if {![file exists $file_to_analyze]} { 14 puts stderr "No such file: $file_to_analyze" 15 exit 1 16} 17if {![file readable $file_to_analyze]} { 18 puts stderr "File is not readable: $file_to_analyze" 19 exit 1 20} 21if {[file size $file_to_analyze]<2048} { 22 puts stderr "Empty or malformed database: $file_to_analyze" 23 exit 1 24} 25 26# Open the database 27# 28sqlite db [lindex $argv 0] 29set DB [btree_open [lindex $argv 0]] 30 31# In-memory database for collecting statistics 32# 33sqlite mem :memory: 34set tabledef\ 35{CREATE TABLE space_used( 36 name clob, -- Name of a table or index in the database file 37 tblname clob, -- Name of associated table 38 is_index boolean, -- TRUE if it is an index, false for a table 39 nentry int, -- Number of entries in the BTree 40 payload int, -- Total amount of data stored in this table or index 41 mx_payload int, -- Maximum payload size 42 n_ovfl int, -- Number of entries that overflow 43 pri_pages int, -- Number of primary pages used 44 ovfl_pages int, -- Number of overflow pages used 45 pri_unused int, -- Number of unused bytes on primary pages 46 ovfl_unused int -- Number of unused bytes on overflow pages 47);} 48mem eval $tabledef 49 50# This query will be used to find the root page number for every index and 51# table in the database. 52# 53set sql { 54 SELECT name, tbl_name, type, rootpage 55 FROM sqlite_master WHERE type IN ('table','index') 56 UNION ALL 57 SELECT 'sqlite_master', 'sqlite_master', 'table', 2 58 ORDER BY 1 59} 60 61# Analyze every table in the database, one at a time. 62# 63foreach {name tblname type rootpage} [db eval $sql] { 64 puts stderr "Analyzing $name..." 65 set cursor [btree_cursor $DB $rootpage 0] 66 set go [btree_first $cursor] 67 set size 0 68 catch {unset pg_used} 69 set unused_ovfl 0 70 set n_overflow 0 71 set cnt_ovfl 0 72 set n_entry 0 73 set mx_size 0 74 set pg_used($rootpage) 1016 75 while {$go==0} { 76 incr n_entry 77 set payload [btree_payload_size $cursor] 78 incr size $payload 79 set stat [btree_cursor_dump $cursor] 80 set pgno [lindex $stat 0] 81 set freebytes [lindex $stat 4] 82 set pg_used($pgno) $freebytes 83 if {$payload>236} { 84 # if {[lindex $stat 8]==0} {error "overflow is empty with $payload"} 85 set n [expr {($payload-236+1019)/1020}] 86 incr n_overflow $n 87 incr cnt_ovfl 88 incr unused_ovfl [expr {$n*1020+236-$payload}] 89 } else { 90 # if {[lindex $stat 8]!=0} {error "overflow not empty with $payload"} 91 } 92 if {$payload>$mx_size} {set mx_size $payload} 93 set go [btree_next $cursor] 94 } 95 btree_close_cursor $cursor 96 set n_primary [llength [array names pg_used]] 97 set unused_primary 0 98 foreach x [array names pg_used] {incr unused_primary $pg_used($x)} 99 regsub -all ' $name '' name 100 set sql "INSERT INTO space_used VALUES('$name'" 101 regsub -all ' $tblname '' tblname 102 append sql ",'$tblname',[expr {$type=="index"}],$n_entry" 103 append sql ",$size,$mx_size,$cnt_ovfl," 104 append sql "$n_primary,$n_overflow,$unused_primary,$unused_ovfl);" 105 mem eval $sql 106} 107 108# Generate a single line of output in the statistics section of the 109# report. 110# 111proc statline {title value {extra {}}} { 112 set len [string length $title] 113 set dots [string range {......................................} $len end] 114 set len [string length $value] 115 set sp2 [string range { } $len end] 116 if {$extra ne ""} { 117 set extra " $extra" 118 } 119 puts "$title$dots $value$sp2$extra" 120} 121 122# Generate a formatted percentage value for $num/$denom 123# 124proc percent {num denom} { 125 if {$denom==0.0} {return ""} 126 set v [expr {$num*100.0/$denom}] 127 if {$v>1.0 && $v<99.0} { 128 return [format %4.1f%% $v] 129 } elseif {$v<0.1 || $v>99.9} { 130 return [format %6.3f%% $v] 131 } else { 132 return [format %5.2f%% $v] 133 } 134} 135 136# Generate a subreport that covers some subset of the database. 137# the $where clause determines which subset to analyze. 138# 139proc subreport {title where} { 140 set hit 0 141 mem eval "SELECT sum(nentry) AS nentry, \ 142 sum(payload) AS payload, \ 143 sum(CASE is_index WHEN 1 THEN 0 ELSE payload-4*nentry END) \ 144 AS data, \ 145 max(mx_payload) AS mx_payload, \ 146 sum(n_ovfl) as n_ovfl, \ 147 sum(pri_pages) AS pri_pages, \ 148 sum(ovfl_pages) AS ovfl_pages, \ 149 sum(pri_unused) AS pri_unused, \ 150 sum(ovfl_unused) AS ovfl_unused \ 151 FROM space_used WHERE $where" {} {set hit 1} 152 if {!$hit} {return 0} 153 puts "" 154 set len [string length $title] 155 incr len 5 156 set stars "***********************************" 157 append stars $stars 158 set stars [string range $stars $len end] 159 puts "*** $title $stars" 160 puts "" 161 statline "Percentage of total database" \ 162 [percent [expr {$pri_pages+$ovfl_pages}] $::file_pgcnt] 163 statline "Number of entries" $nentry 164 set storage [expr {($pri_pages+$ovfl_pages)*1024}] 165 statline "Bytes of storage consumed" $storage 166 statline "Bytes of payload" $payload [percent $payload $storage] 167 statline "Bytes of data" $data [percent $data $storage] 168 set key [expr {$payload-$data}] 169 statline "Bytes of key" $key [percent $key $storage] 170 set avgpay [expr {$nentry>0?$payload/$nentry:0}] 171 statline "Average payload per entry" $avgpay 172 set avgunused [expr {$nentry>0?($pri_unused+$ovfl_unused)/$nentry:0}] 173 statline "Average unused bytes per entry" $avgunused 174 statline "Average fanout" \ 175 [format %.2f [expr {$pri_pages==0?0:($nentry+0.0)/$pri_pages}]] 176 statline "Maximum payload per entry" $mx_payload 177 statline "Entries that use overflow" $n_ovfl [percent $n_ovfl $nentry] 178 statline "Total pages used" [set allpgs [expr {$pri_pages+$ovfl_pages}]] 179 statline "Primary pages used" $pri_pages ;# [percent $pri_pages $allpgs] 180 statline "Overflow pages used" $ovfl_pages ;# [percent $ovfl_pages $allpgs] 181 statline "Unused bytes on primary pages" $pri_unused \ 182 [percent $pri_unused [expr {$pri_pages*1024}]] 183 statline "Unused bytes on overflow pages" $ovfl_unused \ 184 [percent $ovfl_unused [expr {$ovfl_pages*1024}]] 185 set allunused [expr {$ovfl_unused+$pri_unused}] 186 statline "Unused bytes on all pages" $allunused \ 187 [percent $allunused [expr {$allpgs*1024}]] 188 return 1 189} 190 191# Output summary statistics: 192# 193puts "/** Disk-Space Utilization Report For $file_to_analyze" 194puts "*** As of [clock format [clock seconds] -format {%Y-%b-%d %H:%M:%S}]" 195puts "" 196set fsize [file size [lindex $argv 0]] 197set file_pgcnt [expr {$fsize/1024}] 198set usedcnt [mem eval {SELECT sum(pri_pages+ovfl_pages) FROM space_used}] 199set freecnt [expr {$file_pgcnt-$usedcnt-1}] 200set freecnt2 [lindex [btree_get_meta $DB] 0] 201statline {Pages in the whole file (measured)} $file_pgcnt 202set file_pgcnt2 [expr {$usedcnt+$freecnt2+1}] 203statline {Pages in the whole file (calculated)} $file_pgcnt2 204statline {Pages that store data} $usedcnt [percent $usedcnt $file_pgcnt] 205statline {Pages on the freelist (per header)}\ 206 $freecnt2 [percent $freecnt2 $file_pgcnt] 207statline {Pages on the freelist (calculated)}\ 208 $freecnt [percent $freecnt $file_pgcnt] 209statline {Header pages} 1 [percent 1 $file_pgcnt] 210 211set ntable [db eval {SELECT count(*)+1 FROM sqlite_master WHERE type='table'}] 212statline {Number of tables in the database} $ntable 213set nindex [db eval {SELECT count(*) FROM sqlite_master WHERE type='index'}] 214set autoindex [db eval {SELECT count(*) FROM sqlite_master 215 WHERE type='index' AND name LIKE '(% autoindex %)'}] 216set manindex [expr {$nindex-$autoindex}] 217statline {Number of indices} $nindex 218statline {Number of named indices} $manindex [percent $manindex $nindex] 219statline {Automatically generated indices} $autoindex \ 220 [percent $autoindex $nindex] 221 222set bytes_data [mem eval "SELECT sum(payload-4*nentry) FROM space_used 223 WHERE NOT is_index AND name!='sqlite_master'"] 224set total_payload [mem eval "SELECT sum(payload) FROM space_used"] 225statline "Size of the file in bytes" $fsize 226statline "Bytes of payload stored" $total_payload \ 227 [percent $total_payload $fsize] 228statline "Bytes of user data stored" $bytes_data \ 229 [percent $bytes_data $fsize] 230 231# Output table rankings 232# 233puts "" 234puts "*** Page counts for all tables with their indices ********************" 235puts "" 236mem eval {SELECT tblname, count(*) AS cnt, sum(pri_pages+ovfl_pages) AS size 237 FROM space_used GROUP BY tblname ORDER BY size DESC, tblname} {} { 238 statline [string toupper $tblname] $size [percent $size $file_pgcnt] 239} 240 241# Output subreports 242# 243if {$nindex>0} { 244 subreport {All tables and indices} 1 245} 246subreport {All tables} {NOT is_index} 247if {$nindex>0} { 248 subreport {All indices} {is_index} 249} 250foreach tbl [mem eval {SELECT name FROM space_used WHERE NOT is_index 251 ORDER BY name}] { 252 regsub ' $tbl '' qn 253 set name [string toupper $tbl] 254 set n [mem eval "SELECT count(*) FROM space_used WHERE tblname='$qn'"] 255 if {$n>1} { 256 subreport "Table $name and all its indices" "tblname='$qn'" 257 subreport "Table $name w/o any indices" "name='$qn'" 258 subreport "Indices of table $name" "tblname='$qn' AND is_index" 259 } else { 260 subreport "Table $name" "name='$qn'" 261 } 262} 263 264# Output instructions on what the numbers above mean. 265# 266puts { 267*** Definitions ****************************************************** 268 269Number of pages in the whole file 270 271 The number of 1024-byte pages that go into forming the complete database 272 273Pages that store data 274 275 The number of pages that store data, either as primary B*Tree pages or 276 as overflow pages. The number at the right is the data pages divided by 277 the total number of pages in the file. 278 279Pages on the freelist 280 281 The number of pages that are not currently in use but are reserved for 282 future use. The percentage at the right is the number of freelist pages 283 divided by the total number of pages in the file. 284 285Header pages 286 287 The number of pages of header overhead in the database. This value is 288 always 1. The percentage at the right is the number of header pages 289 divided by the total number of pages in the file. 290 291Number of tables in the database 292 293 The number of tables in the database, including the SQLITE_MASTER table 294 used to store schema information. 295 296Number of indices 297 298 The total number of indices in the database. 299 300Number of named indices 301 302 The number of indices created using an explicit CREATE INDEX statement. 303 304Automatically generated indices 305 306 The number of indices used to implement PRIMARY KEY or UNIQUE constraints 307 on tables. 308 309Size of the file in bytes 310 311 The total amount of disk space used by the entire database files. 312 313Bytes of payload stored 314 315 The total number of bytes of payload stored in the database. Payload 316 includes both key and data. The content of the SQLITE_MASTER table is 317 counted when computing this number. The percentage at the right shows 318 the payload divided by the total file size. 319 320Bytes of user data stored 321 322 The total number of bytes of data stored in the database, not counting 323 the database schema information stored in the SQLITE_MASTER table. The 324 percentage at the right is the user data size divided by the total file 325 size. 326 327Percentage of total database 328 329 The amount of the complete database file that is devoted to storing 330 information described by this category. 331 332Number of entries 333 334 The total number of B*Tree key/value pairs stored under this category. 335 336Bytes of storage consumed 337 338 The total amount of disk space required to store all B*Tree entries 339 under this category. The is the total number of pages used times 340 the pages size (1024). 341 342Bytes of payload 343 344 The amount of payload stored under this category. Payload is the sum 345 of keys and data. Each table entry has 4 bytes of key and an arbitrary 346 amount of data. Each index entry has 4 or more bytes of key and no 347 data. The percentage at the right is the bytes of payload divided by 348 the bytes of storage consumed. 349 350Bytes of data 351 352 The amount of data stored under this category. The data space reported 353 includes formatting information such as nul-terminators and field-lengths 354 that are stored with the data. The percentage at the right is the bytes 355 of data divided by bytes of storage consumed. 356 357Bytes of key 358 359 The sum of the sizes of all keys under this category. The percentage at 360 the right is the bytes of key divided by the bytes of storage consumed. 361 362Average payload per entry 363 364 The average amount of payload on each entry. This is just the bytes of 365 payload divided by the number of entries. 366 367Average unused bytes per entry 368 369 The average amount of free space remaining on all pages under this 370 category on a per-entry basis. This is the number of unused bytes on 371 all pages divided by the number of entries. 372 373Maximum payload per entry 374 375 The largest payload size of any entry. 376 377Entries that use overflow 378 379 Up to 236 bytes of payload for each entry are stored directly in the 380 primary B*Tree page. Any additional payload is stored on a linked list 381 of overflow pages. This is the number of entries that exceed 236 bytes 382 in size. The value to the right is the number of entries that overflow 383 divided by the total number of entries. 384 385Total pages used 386 387 This is the number of 1024 byte pages used to hold all information in 388 the current category. This is the sum of primary and overflow pages. 389 390Primary pages used 391 392 This is the number of primary B*Tree pages used. 393 394Overflow pages used 395 396 The total number of overflow pages used for this category. 397 398Unused bytes on primary pages 399 400 The total number of bytes of unused space on all primary pages. The 401 percentage at the right is the number of unused bytes divided by the 402 total number of bytes on primary pages. 403 404Unused bytes on overflow pages 405 406 The total number of bytes of unused space on all overflow pages. The 407 percentage at the right is the number of unused bytes divided by the 408 total number of bytes on overflow pages. 409 410Unused bytes on all pages 411 412 The total number of bytes of unused space on all primary and overflow 413 pages. The percentage at the right is the number of unused bytes 414 divided by the total number of bytes. 415} 416 417# Output the database 418# 419puts "**********************************************************************" 420puts "The entire text of this report can be sourced into any SQL database" 421puts "engine for further analysis. All of the text above is an SQL comment." 422puts "The data used to generate this report follows:" 423puts "*/" 424puts "BEGIN;" 425puts $tabledef 426unset -nocomplain x 427mem eval {SELECT * FROM space_used} x { 428 puts -nonewline "INSERT INTO space_used VALUES(" 429 regsub ' $x(name) '' qn 430 regsub ' $x(tblname) '' qtn 431 puts -nonewline "'$qn','$qtn'," 432 puts -nonewline "$x(is_index),$x(nentry),$x(payload),$x(mx_payload)," 433 puts -nonewline "$x(n_ovfl),$x(pri_pages),$x(ovfl_pages),$x(pri_unused)," 434 puts "$x(ovfl_unused));" 435} 436puts "COMMIT;" 437