#############################################################################

# using log scores and DIC to compare the gaussian and t sampling models
# for the nb10 case study

# all lines below *not* beginning with the R comment character # are
# valid lines of code that should be run in the order they appear

# the code below assumes that you've downloaded JAGS to your machine

# unbuffer the output (if your R console buffers it), so that you can
# watch the progress of the sampling as the code runs

# (1) create a directory into which you can download all relevant files
#     from the course web page; in what follows i'll call it by the name
#     i use on my home desktop,

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2020/NB10-Log-Scores' )

# (2) download the gaussian and t model files from the course web page
#     into the directory in (1); they're called

#     nb10-gaussian-model.txt
#     nb10-t-model.txt

# the gaussian model looks like this (without the # symbols):

# model {
#   mu ~ dnorm( 0.0, 1.0E-6 )
#   sigma ~ dunif( 0.0, 10.0 )
#   for ( i in 1:n ) {
#     y[ i ] ~ dnorm( mu, tau )
#   }
#   tau <- 1 / ( sigma * sigma )
#   y.new ~ dnorm( mu, tau )
# }

# and the t model looks like this:

# model {
#   mu ~ dnorm( 0.0, 1.0E-6 )
#   sigma ~ dunif( 0.0, 10.0 )
#   nu ~ dunif( 2.0, 12.0 )
#   for ( i in 1:n ) {
#     y[ i ] ~ dt( mu, tau, nu )
#   }
#   tau <- 1 / ( sigma * sigma )
#   y.new ~ dt( mu, tau, nu )
# }

# (3) start an R session and change directory to the location in (1)

# (4) the next code block installs the CRAN package 'rjags', if it's not
#     already installed on your machine, and loads it into your current
#     R session

dynamic.require <- function( package ) {

  if ( eval( parse( text = paste( 'require(', package, ')' ) ) ) ) {

    return( 'done' )

  }

  install.packages( package )

  return( eval( parse( text = paste( 'require(', package, ')' ) ) ) )

}

dynamic.require( 'rjags' )

# (5) now enter the NB10 data set as a list and assign it to an object;
#     here i've called it nb10.data :

nb10.data <- list(
  y = c( 409., 400., 406., 399., 402., 406., 401., 403., 401., 403.,
         398., 403., 407., 402., 401., 399., 400., 401., 405., 402.,
         408., 399., 399., 402., 399., 397., 407., 401., 399., 401.,
         403., 400., 410., 401., 407., 423., 406., 406., 402., 405.,
         405., 409., 399., 402., 407., 406., 413., 409., 404., 402.,
         404., 406., 407., 405., 411., 410., 410., 410., 401., 402.,
         404., 405., 392., 407., 406., 404., 403., 408., 404., 407.,
         412., 406., 409., 400., 408., 404., 401., 404., 408., 406.,
         408., 406., 401., 412., 393., 437., 418., 415., 404., 401.,
         401., 407., 412., 375., 409., 406., 398., 406., 403., 404. ),
  n = 100 )

y <- nb10.data$y
n <- nb10.data$n

# (6) let's fit the gaussian model first

nb10.gaussian.initial.values <- list( mu = mean( y ), sigma = sd( y ) )

# (7) we can check the model's syntax and compile it with the following
#     function call:

nb10.gaussian.fit <- jags.model( file = 'nb10-gaussian-model.txt',
  data = nb10.data, inits = nb10.gaussian.initial.values, n.chains = 2 )

# n.chains = 2 here because the DIC computation by an 'rjags' function
# below requires 2 or more parallel mcmc output chains to be run

# Compiling model graph
# .
# .
# .
# |+++++++++++++++++++++++++++++++++++++++++++++++++++++| 100%

# (8) before doing the MCMC sampling, it's good form to set the random
#     number seed, so that other people running your code get the same
#     results you do:

set.seed( 314159 )
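# a side note on reproducibility (an addition to the original workflow,
# offered as a hedged sketch): set.seed() controls R's own random number
# generator, but JAGS maintains its own internal RNGs, so set.seed() by
# itself may not make the MCMC output below exactly reproducible; if
# exact reproducibility matters to you, one option is to pass per-chain
# RNG settings through the inits argument of jags.model; the object name
# and the seed values below are illustrative only

nb10.gaussian.reproducible.inits <- list(
  list( mu = mean( y ), sigma = sd( y ),
    .RNG.name = 'base::Mersenne-Twister', .RNG.seed = 314159 ),
  list( mu = mean( y ), sigma = sd( y ),
    .RNG.name = 'base::Wichmann-Hill', .RNG.seed = 271828 )
)

# to use this, you would re-run the jags.model call in (7) with
# inits = nb10.gaussian.reproducible.inits (one list per chain,
# matching n.chains = 2)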
# (9) next we can do a burn-in of (e.g.) 1000 iterations from the
#     initial values with the following commands:

n.burnin <- 1000

system.time( update( nb10.gaussian.fit, n.iter = n.burnin ) )

# |*************************************************| 100%

# on my desktop, with an Intel Core i7 3770 at 3.40GHz, single-threaded,
# 1000 burn-in iterations took about 0.4 seconds

# (10) now we can do a monitoring run of (e.g.) 100000 iterations with
#      the following commands (at decent GHz each call takes several
#      seconds):

n.monitor <- 100000

system.time( nb10.gaussian.results <- jags.samples( nb10.gaussian.fit,
  variable.names = c( 'mu', 'sigma', 'y.new' ), n.iter = n.monitor ) )

#    user  system elapsed
#    7.29    0.39    7.69

system.time( nb10.gaussian.dic.results <- dic.samples( nb10.gaussian.fit,
  n.iter = n.monitor, thin = 1, type = 'pD' ) )

#    user  system elapsed
#    9.34    0.26    9.61

print( nb10.gaussian.dic.results )

# Mean deviance:  658.2
# penalty 2.047
# Penalized deviance: 660.2

# here 'penalty' = pD and 'Penalized deviance' = DIC

# rjags has correctly computed (up to monte carlo noise) that there are
# 2 parameters in the model ( mu and sigma ), by arriving at pD = 2.047,
# and the DIC value for the gaussian model is 660.2

# (11) now we're ready to do the same steps with the t model

nb10.t.initial.values <- list( mu = mean( y ), sigma = sd( y ), nu = 3.0 )

nb10.t.fit <- jags.model( file = 'nb10-t-model.txt', data = nb10.data,
  inits = nb10.t.initial.values, n.chains = 2 )

set.seed( 314159 )

n.burnin <- 1000

system.time( update( nb10.t.fit, n.iter = n.burnin ) )

n.monitor <- 100000

system.time( nb10.t.results <- jags.samples( nb10.t.fit,
  variable.names = c( 'mu', 'sigma', 'nu', 'y.new' ), n.iter = n.monitor ) )

system.time( nb10.t.dic.results <- dic.samples( nb10.t.fit,
  n.iter = n.monitor, thin = 1, type = 'pD' ) )

print( nb10.t.dic.results )

# Mean deviance:  619.4
# penalty 2.637
# Penalized deviance: 622

# as before, 'penalty' = pD and 'Penalized deviance' = DIC

# rjags has estimated the number of parameters in this model to be 2.637;
# we know that it's 3 ( mu, sigma and nu ), so this is not a terrific
# value of pD

# the theory behind why the pD value in DIC is sometimes inaccurate is
# fairly well understood; i tried a different parameterization of the
# t model that's supposed to force pD to behave better, and its value
# went up to 2.82

# in any case the resulting DIC value of about 622 is (vastly) smaller
# than the 660.2 DIC value for the gaussian model; in this case study,
# DIC has correctly concluded that the t model is better than the gaussian

# (12) now let's work on log scores for the two models

# it's hard to find a CRAN package that computes log scores for an
# arbitrary model, so let's compute them ourselves, to illustrate the
# process

# we're implementing equation (76) on page 37 of part 11 of the lecture
# notes: for each data point y[ i ], estimate its predictive ordinate by
# averaging the model's sampling density at y[ i ] over the mcmc draws of
# the parameters; the log score for a model is then the average of the
# logs of these estimated ordinates across the n data points

# get the mcmc draws for ( mu, sigma ) in the gaussian model, and the
# draws for ( mu, sigma, nu ) in the t model

gaussian.mu.star <- nb10.gaussian.results$mu
gaussian.sigma.star <- nb10.gaussian.results$sigma

t.mu.star <- nb10.t.results$mu
t.sigma.star <- nb10.t.results$sigma
t.nu.star <- nb10.t.results$nu

# sort y, to make the log score results more interpretable

y <- sort( y )

# we need the dnorm function to evaluate the gaussian density, and we
# also need a density function for the scaled t distribution:

dt.scaled <- function( y, mu, sigma, nu ) {

  return( exp( lgamma( ( nu + 1 ) / 2 ) -
    ( ( nu + 1 ) / 2 ) * log( 1 + ( y - mu )^2 / ( nu * sigma^2 ) ) -
    lgamma( nu / 2 ) - log( nu * pi ) / 2 - log( sigma ) ) )

}
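# before using a hand-written density it's cheap to sanity-check it
# (this check is an addition, not part of the original script): the
# scaled t density should equal R's built-in standard t density dt(),
# evaluated at the standardized value and divided by sigma; the two
# columns printed below should agree up to floating-point rounding
# (the test points and parameter values here are arbitrary)

check.y <- c( 395, 400, 404.5, 410, 437 )

cbind( dt.scaled( check.y, mu = 404, sigma = 4, nu = 5 ),
  dt( ( check.y - 404 ) / 4, df = 5 ) / 4 )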
log.score.contributions <- matrix( NA, n, 2 )

system.time( {

  for ( i in 1:n ) {

    log.score.contributions[ i, 1 ] <- log( mean( dnorm( y[ i ],
      gaussian.mu.star, gaussian.sigma.star ) ) )

    log.score.contributions[ i, 2 ] <- log( mean( dt.scaled( y[ i ],
      t.mu.star, t.sigma.star, t.nu.star ) ) )

  }

} )

#    user  system elapsed
#    9.19    0.05    9.24

print( log.scores <- apply( log.score.contributions, 2, mean ) )

#      gaussian          t
# [1] -3.262099  -3.082312

# log scores, as with DIC in this case study, conclude that the t model
# is better than the gaussian (recall that larger log scores are better
# than smaller ones)

# the log score scale is hard to interpret intuitively; a natural
# question is

#   how much higher does the log score for model 2 have to be than the
#   log score for model 1 to declare model 2 'significantly' better?

# this question is not easy to answer

# let's look at the individual data point contributions to the overall
# log scores (recall that the data vector was sorted from smallest to
# largest previously); the last column below is 1 when the t model's
# contribution is larger (better) than the gaussian's

cbind( y, log.score.contributions,
  0 + ( log.score.contributions[ , 2 ] > log.score.contributions[ , 1 ] ) )

#                                            t
#                                            better
#                                            than
#          y    gaussian           t         G
#   [1,] 375 -12.198117   -8.588664          1
#   [2,] 392  -4.638698   -5.351899          0
#   [3,] 393  -4.362375   -5.079224          0
#   .
#   .
#   .
#  [12,] 399  -3.166777   -3.307888          0
#  [13,] 399  -3.166777   -3.307888          0
#  [14,] 400  -3.047119   -3.028850          1
#  [15,] 400  -3.047119   -3.028850          1
#   .
#   .
#   .
#  [81,] 408  -2.935951   -2.875571          1
#  [82,] 408  -2.935951   -2.875571          1
#  [83,] 409  -3.028124   -3.138244          0
#  [84,] 409  -3.028124   -3.138244          0
#   .
#   .
#   .
#  [98,] 418  -4.881877   -5.712829          0
#  [99,] 423  -6.655059   -6.848403          0
# [100,] 437 -13.889084   -9.018675          1

# let's visualize the results in the table

plot( y, log.score.contributions[ , 1 ],
  ylim = c( min( log.score.contributions ), max( log.score.contributions ) ),
  ylab = 'Log Score Contributions', col = 'darkblue', pch = 16 )

lines( y, log.score.contributions[ , 1 ], lty = 1, lwd = 2,
  col = 'darkblue' )

points( y, log.score.contributions[ , 2 ], col = 'darkcyan', pch = 17 )

lines( y, log.score.contributions[ , 2 ], lty = 2, lwd = 2,
  col = 'darkcyan' )

legend( 405, -10, c( 'Gaussian', 't' ), pch = c( 16, 17 ),
  col = c( 'darkblue', 'darkcyan' ) )

# the t model fits better both in the tails (where the most influential
# observations are, from the gaussian point of view) and in the center
# (where most of the data values are)

mean( log.score.contributions[ , 2 ] > log.score.contributions[ , 1 ] )

# [1] 0.71

# the t model fits 71% of the data points better than the gaussian does

# note finally that the fact that the log score for the t model is bigger
# than the log score for the gaussian does not mean that the t model fits
# the data well; it just means that the t model fits the data better than
# the gaussian model does

# we need another method to answer the question

#   is the t model good enough to stop looking for a better model?
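# the closing question above is a model-checking question rather than a
# model-comparison question; one simple, informal possibility -- sketched
# here as an illustration, not necessarily the method the lecture notes
# have in mind -- is a graphical posterior predictive check based on the
# y.new draws we already monitored: if the t model is adequate, the
# observed data should look like a plausible draw from its posterior
# predictive distribution

y.new.gaussian <- as.vector( nb10.gaussian.results$y.new )
y.new.t <- as.vector( nb10.t.results$y.new )

hist( y, breaks = 20, probability = TRUE, col = 'gray85',
  main = 'NB10 data with posterior predictive densities', xlab = 'y' )

lines( density( y.new.gaussian ), lty = 1, lwd = 2, col = 'darkblue' )

lines( density( y.new.t ), lty = 2, lwd = 2, col = 'darkcyan' )

legend( 'topright', c( 'gaussian predictive', 't predictive' ),
  lty = c( 1, 2 ), lwd = 2, col = c( 'darkblue', 'darkcyan' ) )

############################################################################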