#############################################################################

# using log scores and DIC to compare the gaussian and t sampling models
# for the nb10 case study

# all lines below *not* beginning with the R comment character # are
# valid lines of code that should be run in the order they appear

# the code below assumes that you've downloaded JAGS to your machine

# unbuffer the output (if your R console buffers it), so that you can
# watch the progress of the sampling as the code runs

# (1) create a directory into which you can download all relevant files
#     from the course web page; in what follows i'll call it by the name
#     i use on my home desktop,

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2020/NB10-Log-Scores' )

# (2) download the gaussian and t model files from the course web page
#     into the directory in (1); they're called

#     nb10-gaussian-model.txt
#     nb10-t-model.txt

# the gaussian model looks like this (without the # symbols):

# model {
#   mu ~ dnorm( 0.0, 1.0E-6 )
#   sigma ~ dunif( 0.0, 10.0 )
#   for ( i in 1:n ) {
#     y[ i ] ~ dnorm( mu, tau )
#   }
#   tau <- 1 / ( sigma * sigma )
#   y.new ~ dnorm( mu, tau )
# }

# and the t model looks like this:

# model {
#   mu ~ dnorm( 0.0, 1.0E-6 )
#   sigma ~ dunif( 0.0, 10.0 )
#   nu ~ dunif( 2.0, 12.0 )
#   for ( i in 1:n ) {
#     y[ i ] ~ dt( mu, tau, nu )
#   }
#   tau <- 1 / ( sigma * sigma )
#   y.new ~ dt( mu, tau, nu )
# }

# (3) start an R session and change directory to the location in (1)

# (4) the next code block installs the CRAN package 'rjags', if it's not
#     already installed on your machine, and loads it into your current
#     R session

dynamic.require <- function( package ) {

  if ( eval( parse( text = paste( 'require(', package, ')' ) ) ) ) {

    return( 'done' )

  }

  install.packages( package )

  return( eval( parse( text = paste( 'require(', package, ')' ) ) ) )

}

dynamic.require( 'rjags' )

# (5) now enter the NB10 data set as a list and assign it to an object;
#     here i've called it nb10.data :

nb10.data <- list(
  y = c( 409., 400., 406., 399., 402., 406., 401., 403., 401., 403.,
         398., 403., 407., 402., 401., 399., 400., 401., 405., 402.,
         408., 399., 399., 402., 399., 397., 407., 401., 399., 401.,
         403., 400., 410., 401., 407., 423., 406., 406., 402., 405.,
         405., 409., 399., 402., 407., 406., 413., 409., 404., 402.,
         404., 406., 407., 405., 411., 410., 410., 410., 401., 402.,
         404., 405., 392., 407., 406., 404., 403., 408., 404., 407.,
         412., 406., 409., 400., 408., 404., 401., 404., 408., 406.,
         408., 406., 401., 412., 393., 437., 418., 415., 404., 401.,
         401., 407., 412., 375., 409., 406., 398., 406., 403., 404. ),
  n = 100 )

y <- nb10.data$y
n <- nb10.data$n

# (6) let's fit the gaussian model first

nb10.gaussian.initial.values <- list( mu = mean( y ), sigma = sd( y ) )

# (7) we can check the model's syntax and compile it with the following
#     function call:

nb10.gaussian.fit <- jags.model( file = 'nb10-gaussian-model.txt',
  data = nb10.data, inits = nb10.gaussian.initial.values, n.chains = 2 )

# n.chains = 2 here because the DIC computation by an 'rjags' function
# below requires 2 or more parallel mcmc output chains to be run

# Compiling model graph
# .
# .
# .
# |+++++++++++++++++++++++++++++++++++++++++++++++++++++| 100%

# (8) before doing the MCMC sampling, it's good form to set the random
#     number seed, so that other people running your code get the same
#     results you do:

set.seed( 314159 )
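# a side note on reproducibility (an addition to the original workflow,
# offered as a hedged sketch): set.seed() controls R's own random number
# generator, but JAGS maintains its own internal RNGs, so set.seed() by
# itself may not make the MCMC output below exactly reproducible; if
# exact reproducibility matters to you, one option is to pass per-chain
# RNG settings through the inits argument of jags.model; the object name
# and the seed values below are illustrative only

nb10.gaussian.reproducible.inits <- list(
  list( mu = mean( y ), sigma = sd( y ),
    .RNG.name = 'base::Mersenne-Twister', .RNG.seed = 314159 ),
  list( mu = mean( y ), sigma = sd( y ),
    .RNG.name = 'base::Wichmann-Hill', .RNG.seed = 271828 )
)

# to use this, you would re-run the jags.model call in (7) with
# inits = nb10.gaussian.reproducible.inits (one list per chain,
# matching n.chains = 2)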
# (9) next we can do a burn-in of (e.g.) 1000 iterations from the
#     initial values with the following commands:

n.burnin <- 1000

system.time( update( nb10.gaussian.fit, n.iter = n.burnin ) )

# |*************************************************| 100%

# on my desktop, with an Intel Core i7 3770 at 3.40GHz, single-threaded,
# 1000 burn-in iterations took about 0.4 seconds

# (10) now we can do a monitoring run of (e.g.) 100000 iterations with
#      the following commands (at decent GHz each call takes several
#      seconds):

n.monitor <- 100000

system.time( nb10.gaussian.results <- jags.samples( nb10.gaussian.fit,
  variable.names = c( 'mu', 'sigma', 'y.new' ), n.iter = n.monitor ) )

#    user  system elapsed
#    7.29    0.39    7.69

system.time( nb10.gaussian.dic.results <- dic.samples( nb10.gaussian.fit,
  n.iter = n.monitor, thin = 1, type = 'pD' ) )

#    user  system elapsed
#    9.34    0.26    9.61

print( nb10.gaussian.dic.results )

# Mean deviance:  658.2
# penalty 2.047
# Penalized deviance: 660.2

# here 'penalty' = pD and 'Penalized deviance' = DIC

# rjags has correctly computed (up to monte carlo noise) that there are
# 2 parameters in the model ( mu and sigma ), by arriving at pD = 2.047,
# and the DIC value for the gaussian model is 660.2

# (11) now we're ready to do the same steps with the t model

nb10.t.initial.values <- list( mu = mean( y ), sigma = sd( y ), nu = 3.0 )

nb10.t.fit <- jags.model( file = 'nb10-t-model.txt', data = nb10.data,
  inits = nb10.t.initial.values, n.chains = 2 )

set.seed( 314159 )

n.burnin <- 1000

system.time( update( nb10.t.fit, n.iter = n.burnin ) )

n.monitor <- 100000

system.time( nb10.t.results <- jags.samples( nb10.t.fit,
  variable.names = c( 'mu', 'sigma', 'nu', 'y.new' ), n.iter = n.monitor ) )

system.time( nb10.t.dic.results <- dic.samples( nb10.t.fit,
  n.iter = n.monitor, thin = 1, type = 'pD' ) )

print( nb10.t.dic.results )

# Mean deviance:  619.4
# penalty 2.637
# Penalized deviance: 622

# as before, 'penalty' = pD and 'Penalized deviance' = DIC

# rjags has estimated the number of parameters in this model to be 2.637;
# we know that it's 3 ( mu, sigma and nu ), so this is not a terrific
# value of pD

# the theory behind why the pD value in DIC is sometimes inaccurate is
# fairly well understood; i tried a different parameterization of the
# t model that's supposed to force pD to behave better, and its value
# went up to 2.82

# in any case the resulting DIC value of about 622 is (vastly) smaller
# than the 660.2 DIC value for the gaussian model; in this case study,
# DIC has correctly concluded that the t model is better than the gaussian

# (12) now let's work on log scores for the two models

# it's hard to find a CRAN package that computes log scores for an
# arbitrary model, so let's compute them ourselves, to illustrate the
# process

# we're implementing equation (76) on page 37 of part 11 of the lecture
# notes: for each data point y[ i ], estimate its predictive ordinate by
# averaging the model's sampling density at y[ i ] over the mcmc draws of
# the parameters; the log score for a model is then the average of the
# logs of these estimated ordinates across the n data points

# get the mcmc draws for ( mu, sigma ) in the gaussian model, and the
# draws for ( mu, sigma, nu ) in the t model

gaussian.mu.star <- nb10.gaussian.results$mu
gaussian.sigma.star <- nb10.gaussian.results$sigma

t.mu.star <- nb10.t.results$mu
t.sigma.star <- nb10.t.results$sigma
t.nu.star <- nb10.t.results$nu

# sort y, to make the log score results more interpretable

y <- sort( y )

# we need the dnorm function to evaluate the gaussian density, and we
# also need a density function for the scaled t distribution:

dt.scaled <- function( y, mu, sigma, nu ) {

  return( exp( lgamma( ( nu + 1 ) / 2 ) -
    ( ( nu + 1 ) / 2 ) * log( 1 + ( y - mu )^2 / ( nu * sigma^2 ) ) -
    lgamma( nu / 2 ) - log( nu * pi ) / 2 - log( sigma ) ) )

}
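# before using a hand-written density it's cheap to sanity-check it
# (this check is an addition, not part of the original script): the
# scaled t density should equal R's built-in standard t density dt(),
# evaluated at the standardized value and divided by sigma; the two
# columns printed below should agree up to floating-point rounding
# (the test points and parameter values here are arbitrary)

check.y <- c( 395, 400, 404.5, 410, 437 )

cbind( dt.scaled( check.y, mu = 404, sigma = 4, nu = 5 ),
  dt( ( check.y - 404 ) / 4, df = 5 ) / 4 )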
log.score.contributions <- matrix( NA, n, 2 )

system.time( {

  for ( i in 1:n ) {

    log.score.contributions[ i, 1 ] <- log( mean( dnorm( y[ i ],
      gaussian.mu.star, gaussian.sigma.star ) ) )

    log.score.contributions[ i, 2 ] <- log( mean( dt.scaled( y[ i ],
      t.mu.star, t.sigma.star, t.nu.star ) ) )

  }

} )

#    user  system elapsed
#    9.19    0.05    9.24

print( log.scores <- apply( log.score.contributions, 2, mean ) )

#      gaussian          t
# [1] -3.262099  -3.082312

# log scores, as with DIC in this case study, conclude that the t model
# is better than the gaussian (recall that larger log scores are better
# than smaller ones)

# the log score scale is hard to interpret intuitively; a natural
# question is

#   how much higher does the log score for model 2 have to be than the
#   log score for model 1 to declare model 2 'significantly' better?

# this question is not easy to answer

# let's look at the individual data point contributions to the overall
# log scores (recall that the data vector was sorted from smallest to
# largest previously); the last column below is 1 when the t model's
# contribution is larger (better) than the gaussian's

cbind( y, log.score.contributions,
  0 + ( log.score.contributions[ , 2 ] > log.score.contributions[ , 1 ] ) )

#                                            t
#                                            better
#                                            than
#          y    gaussian           t         G
#   [1,] 375 -12.198117   -8.588664          1
#   [2,] 392  -4.638698   -5.351899          0
#   [3,] 393  -4.362375   -5.079224          0
#   .
#   .
#   .
#  [12,] 399  -3.166777   -3.307888          0
#  [13,] 399  -3.166777   -3.307888          0
#  [14,] 400  -3.047119   -3.028850          1
#  [15,] 400  -3.047119   -3.028850          1
#   .
#   .
#   .
#  [81,] 408  -2.935951   -2.875571          1
#  [82,] 408  -2.935951   -2.875571          1
#  [83,] 409  -3.028124   -3.138244          0
#  [84,] 409  -3.028124   -3.138244          0
#   .
#   .
#   .
#  [98,] 418  -4.881877   -5.712829          0
#  [99,] 423  -6.655059   -6.848403          0
# [100,] 437 -13.889084   -9.018675          1

# let's visualize the results in the table

plot( y, log.score.contributions[ , 1 ],
  ylim = c( min( log.score.contributions ), max( log.score.contributions ) ),
  ylab = 'Log Score Contributions', col = 'darkblue', pch = 16 )

lines( y, log.score.contributions[ , 1 ], lty = 1, lwd = 2,
  col = 'darkblue' )

points( y, log.score.contributions[ , 2 ], col = 'darkcyan', pch = 17 )

lines( y, log.score.contributions[ , 2 ], lty = 2, lwd = 2,
  col = 'darkcyan' )

legend( 405, -10, c( 'Gaussian', 't' ), pch = c( 16, 17 ),
  col = c( 'darkblue', 'darkcyan' ) )

# the t model fits better both in the tails (where the most influential
# observations are, from the gaussian point of view) and in the center
# (where most of the data values are)

mean( log.score.contributions[ , 2 ] > log.score.contributions[ , 1 ] )

# [1] 0.71

# the t model fits 71% of the data points better than the gaussian does

# note finally that the fact that the log score for the t model is bigger
# than the log score for the gaussian does not mean that the t model fits
# the data well; it just means that the t model fits the data better than
# the gaussian model does

# we need another method to answer the question

#   is the t model good enough to stop looking for a better model?
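# the closing question above is a model-checking question rather than a
# model-comparison question; one simple, informal possibility -- sketched
# here as an illustration, not necessarily the method the lecture notes
# have in mind -- is a graphical posterior predictive check based on the
# y.new draws we already monitored: if the t model is adequate, the
# observed data should look like a plausible draw from its posterior
# predictive distribution

y.new.gaussian <- as.vector( nb10.gaussian.results$y.new )
y.new.t <- as.vector( nb10.t.results$y.new )

hist( y, breaks = 20, probability = TRUE, col = 'gray85',
  main = 'NB10 data with posterior predictive densities', xlab = 'y' )

lines( density( y.new.gaussian ), lty = 1, lwd = 2, col = 'darkblue' )

lines( density( y.new.t ), lty = 2, lwd = 2, col = 'darkcyan' )

legend( 'topright', c( 'gaussian predictive', 't predictive' ),
  lty = c( 1, 2 ), lwd = 2, col = c( 'darkblue', 'darkcyan' ) )

############################################################################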