#############################################################################

# gibbs sampling example: fitting the gaussian model with both
# mu and sigma unknown (lecture notes part 9, pages 57-71)

# the full-conditional distributions are given on page 57

# the next (driver) function implements gibbs sampling for this model
# i use the conjugate prior (lecture notes part 8, page 135)

# as inputs to this function,

#   y is the data vector
#   nu.0 is the prior sample size for sigma^2
#   sigma2.0 is the prior estimate of sigma^2
#   mu.0 is the prior estimate of mu
#   kappa.0 is the prior sample size for mu
#   mu.initial is the starting value for mu
#   sigma2.initial is the starting value for sigma^2
#   M is the number of monitoring iterations
#   B is the number of burn-in iterations
#   seed is the random-number seed used to initialize the sampler

gibbs.sampling.example <- function( y, nu.0, sigma2.0, mu.0, kappa.0,
  mu.initial, sigma2.initial, M, B, seed ) {

  set.seed( seed )

  n <- length( y )
  y.bar <- mean( y )

  # columns of the mcmc data set: mu, sigma^2, sigma, and a new y
  # (the posterior predictive draw); row 1 holds the starting values

  mcmc.data.set <- matrix( NA, M + B + 1, 4 )

  mcmc.data.set[ 1, ] <- c( mu.initial, sigma2.initial,
    sqrt( sigma2.initial ), mu.initial )

  for ( i in 2:( M + B + 1 ) ) {

    # draw mu from its full conditional, given the previous sigma^2 draw

    mu.star <- rnorm( 1, ( kappa.0 * mu.0 + n * y.bar ) / ( kappa.0 + n ),
      sqrt( mcmc.data.set[ i - 1, 2 ] / ( kappa.0 + n ) ) )

    # draw sigma^2 from its full conditional, given the new mu draw

    s2.mu.star <- sum( ( y - mu.star )^2 ) / n

    sigma2.star <- rsichi2( 1, nu.0 + 1 + n,
      ( nu.0 * sigma2.0 + kappa.0 * ( mu.star - mu.0 )^2 + n * s2.mu.star ) /
      ( nu.0 + 1 + n ) )

    # draw a new y from the gaussian sampling distribution,
    # given the current ( mu, sigma^2 ) draw

    y.star <- rnorm( 1, mu.star, sqrt( sigma2.star ) )

    mcmc.data.set[ i, ] <- c( mu.star, sigma2.star, sqrt( sigma2.star ),
      y.star )

    if ( ( i %% 10000 ) == 0 ) print( i )

  }

  return( mcmc.data.set )

}

# here's a simple function that generates IID samples from the
# scaled inverse chi-squared distribution

rsichi2 <- function( n, nu, s2 ) {

  return( nu * s2 / rchisq( n, nu ) )

}

# let's create a small gaussian data set for illustration;
# here n = 10 and the data-generating mean and SD are 50 and 5,
# respectively

set.seed( 3531229 )

print( y <- sort( signif( rnorm( 10, 50, 5 ), 3 ) ) )

# [1] 43.5 48.5 50.5 51.4 52.2 53.2 53.6 53.9 55.3 61.6

mean( y )

# [1] 52.37

sd( y )

# [1] 4.671914

# here i'll choose a rather informative prior for illustration:
# the prior sample sizes for sigma^2 and mu are both 10,
# and the prior estimates differ quite a bit from the data-generating values

nu.0 <- 10
sigma2.0 <- 15
mu.0 <- 90
kappa.0 <- 10

# i'm deliberately starting the markov chain off
# at values far from the posterior means

mu.initial <- 200
sigma2.initial <- 0.1

M <- 100000
B <- 1000
seed <- 123456

system.time( mcmc.data.set.1 <- gibbs.sampling.example( y, nu.0, sigma2.0,
  mu.0, kappa.0, mu.initial, sigma2.initial, M, B, seed ) )

# [1] 10000
# [1] 20000
# .
# .
# .
# [1] 90000
# [1] 100000

#    user  system elapsed
#    1.18    0.09    1.28

str( mcmc.data.set.1 )

# num [1:101001, 1:4] 70 72 65.3 84.5 66.8 ...
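
# before comparing the mcmc output with the exact conjugate answer below,
# here's a minimal optional sketch of a first numerical look at the
# monitored draws: posterior means, SDs and 95% central intervals for each
# column (the object name mcmc.summary.1 is just illustrative and is not
# used elsewhere in these notes)

mcmc.summary.1 <- t( apply( mcmc.data.set.1[ ( 1 + B ):( 1 + B + M ), ], 2,
  function( column ) {
    c( mean = mean( column ), sd = sd( column ),
      quantile( column, c( 0.025, 0.975 ) ) )
  } ) )

rownames( mcmc.summary.1 ) <- c( 'mu', 'sigma2', 'sigma', 'y.next' )

print( round( mcmc.summary.1, 2 ) )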
# the output of the driver function is the mcmc data set,
# which has ( 1 + B + M ) rows and 4 columns: the posterior draws
# for mu, sigma^2, sigma, and a new y (the predictive distribution)

# since this is a conjugate situation, we know what the correct
# posterior distribution is, and we can compare the mcmc output
# with the right answer

print( n <- length( y ) )

# [1] 10

print( nu.n <- nu.0 + n )

# [1] 20

print( y.bar <- mean( y ) )

# [1] 52.37

print( s2 <- var( y ) )

# [1] 21.82678

print( sigma2.n <- ( nu.0 * sigma2.0 + ( n - 1 ) * s2 +
  kappa.0 * n * ( y.bar - mu.0 )^2 / ( kappa.0 + n ) ) / nu.n )

# [1] 371.3263

print( mu.n <- ( kappa.0 * mu.0 + n * y.bar ) / ( kappa.0 + n ) )

# [1] 71.185

print( kappa.n <- kappa.0 + n )

# [1] 20

# here i generate IID draws from the correct marginal posterior for sigma^2,
# to serve as a benchmark for the mcmc output

sigma2.benchmark <- rsichi2( M, nu.n, sigma2.n )

qqplot( mcmc.data.set.1[ ( 1 + B ):( 1 + B + M ), 2 ], sigma2.benchmark,
  xlab = 'MCMC output for sigma^2', ylab = 'Benchmark output for sigma^2' )

abline( 0, 1, lwd = 2, col = 'blue' )

# you can see that the mcmc output essentially coincides
# with random draws from the correct marginal posterior for sigma^2

# the next function generates IID samples from the scaled t distribution

rscaled.t <- function( n, mu, sigma2, nu ) {

  return( sqrt( sigma2 ) * rt( n, nu ) + mu )

}

mu.benchmark <- rscaled.t( M, mu.n, sigma2.n / kappa.n, nu.n )

qqplot( mcmc.data.set.1[ ( 1 + B ):( 1 + B + M ), 1 ], mu.benchmark,
  xlab = 'MCMC output for mu', ylab = 'Benchmark output for mu' )

abline( 0, 1, lwd = 2, col = 'blue' )

# again the mcmc output essentially coincides with random draws
# from the correct marginal posterior for mu

y.next.benchmark <- rscaled.t( M, mu.n,
  ( kappa.n + 1 ) * sigma2.n / kappa.n, nu.n )

qqplot( mcmc.data.set.1[ ( 1 + B ):( 1 + B + M ), 4 ], y.next.benchmark,
  xlab = 'MCMC output for y.next', ylab = 'Benchmark output for y.next' )

abline( 0, 1, lwd = 2, col = 'blue' )

# and once again the mcmc output essentially coincides with random draws
# from the correct posterior predictive distribution for a new y

# let's see what effect the burn-in period had

# recall from above that i started the chain off at ridiculous
# starting values relative to the actual data-generating mu and sigma^2:
# ( mu.initial, sigma2.initial ) = ( 200, 0.1 ) versus
# ( mu.data.generating, sigma2.data.generating ) = ( 50, 25 )

par( mfrow = c( 2, 1 ) )

plot( 1:50, mcmc.data.set.1[ 1:50, 1 ], type = 'l',
  xlab = 'iteration number', ylab = 'mu draws', lwd = 2, col = 'red',
  ylim = c( 0, 200 ) )

abline( h = mu.n, lwd = 2, lty = 2, col = 'darkcyan' )

abline( v = 2, lwd = 2, lty = 3 )

text( 7.5, 150, 'iteration 2' )

plot( 1:50, mcmc.data.set.1[ 1:50, 2 ], type = 'l',
  xlab = 'iteration number', ylab = 'sigma^2 draws', lwd = 2, col = 'red',
  ylim = c( 0, 1000 ) )

abline( h = sigma2.n, lwd = 2, lty = 2, col = 'darkcyan' )

abline( v = 2, lwd = 2, lty = 3 )

text( 7.5, 800, 'iteration 2' )

par( mfrow = c( 1, 1 ) )

# the chain is already in equilibrium at iteration 2 (!)

# this is why, in simple problems like the ones we've looked at so far,
# a default burn-in of 1,000 iterations (a) is plenty adequate and
# (b) does essentially no harm

##############################################################################
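
# a minimal optional sketch of one further check (the object name
# mu.quantile.comparison is just illustrative): the qqplots above compare
# entire distributions, and a quick numeric spot-check is to set a few
# central quantiles of the monitored mu draws next to the corresponding
# quantiles of the scaled-t benchmark; with M = 100000 monitored draws
# the two rows of the resulting table should agree to within monte carlo noise

mu.quantile.comparison <- rbind(
  mcmc = quantile( mcmc.data.set.1[ ( 1 + B ):( 1 + B + M ), 1 ],
    c( 0.025, 0.25, 0.5, 0.75, 0.975 ) ),
  benchmark = quantile( mu.benchmark, c( 0.025, 0.25, 0.5, 0.75, 0.975 ) )
)

print( round( mu.quantile.comparison, 2 ) )

##############################################################################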