############################################################################

# simple example of metropolis sampling in a case study with k = 1,
# namely the problem whose closed-form conjugate solution you found
# in quiz 3:

#   p ( sigma^2 ) proportional to 1 / sigma^2

#   ( Y_i | sigma^2 G B ) ~ IID N( mu, sigma^2 )   ( i = 1, ..., n )

# this is based on pages 31-55 in part 8 of the lecture notes

# unbuffer the output

# change directory to an appropriate place
# on my home desktop this is

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2020' )

# here's a driver function for the metropolis sampling:

metropolis.example <- function( y, mu, sigma.star, sigma2.0, M, B, seed ) {

  set.seed( seed )

  n <- length( y )

  # with mu known, sigma2.hat is the mle of sigma^2; lambda.hat is its log

  sigma2.hat <- sum( ( y - mu )^2 ) / n
  lambda.hat <- log( sigma2.hat )

  # column 1 stores lambda = log( sigma^2 ), column 2 stores sigma^2

  mcmc.data.set <- matrix( NA, M + B + 1, 2 )

  mcmc.data.set[ 1, ] <- c( log( sigma2.0 ), sigma2.0 )

  acceptance.count <- 0

  for ( i in 2:( M + B + 1 ) ) {

    # gaussian random-walk proposal on the lambda scale

    lambda.current <- mcmc.data.set[ i - 1, 1 ]
    lambda.star <- rnorm( 1, lambda.current, sigma.star )

    u <- runif( 1 )

    if ( u <= acceptance.probability( lambda.star, lambda.current,
      lambda.hat, n ) ) {

      mcmc.data.set[ i, 1 ] <- lambda.star
      mcmc.data.set[ i, 2 ] <- exp( lambda.star )

      acceptance.count <- acceptance.count + 1

    } else {

      mcmc.data.set[ i, 1 ] <- lambda.current
      mcmc.data.set[ i, 2 ] <- exp( lambda.current )

    }

    if ( ( i %% 10000 ) == 0 ) print( i )

  }

  print( acceptance.rate <- acceptance.count / ( M + B ) )

  return( mcmc.data.set )

}

# now the acceptance probability function:

acceptance.probability <- function( lambda.star, lambda.current, lambda.hat, n ) {

  return( exp( log.posterior( lambda.star, lambda.hat, n ) -
    log.posterior( lambda.current, lambda.hat, n ) ) )

}

# and the computation of the log posterior:

log.posterior <- function( lambda, lambda.hat, n ) {

  return( log.prior( lambda ) + log.likelihood( lambda, lambda.hat, n ) )

}

# which in turn depends on the log likelihood:

log.likelihood <- function( lambda, lambda.hat, n ) {

  # log likelihood for lambda = log( sigma^2 ), up to an additive constant

  return( - lambda * n / 2 - n * exp( lambda.hat ) / ( 2 * exp( lambda ) ) )

}

# and the log prior:

log.prior <- function( lambda ) {

  # the prior p( sigma^2 ) proportional to 1 / sigma^2 transforms to a flat
  # (constant) prior on lambda = log( sigma^2 ), so its log is 0

  return( 0 )

}

# define the NB10 data set

y <- c( 409., 400., 406., 399., 402., 406., 401., 403., 401., 403.,
  398., 403., 407., 402., 401., 399., 400., 401., 405., 402.,
  408., 399., 399., 402., 399., 397., 407., 401., 399., 401.,
  403., 400., 410., 401., 407., 423., 406., 406., 402., 405.,
  405., 409., 399., 402., 407., 406., 413., 409., 404., 402.,
  404., 406., 407., 405., 411., 410., 410., 410., 401., 402.,
  404., 405., 392., 407., 406., 404., 403., 408., 404., 407.,
  412., 406., 409., 400., 408., 404., 401., 404., 408., 406.,
  408., 406., 401., 412., 393., 437., 418., 415., 404., 401.,
  401., 407., 412., 375., 409., 406., 398., 406., 403., 404. )
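
# a quick optional sanity check (my addition, not in the lecture notes):
# the log.likelihood function above, written in terms of
# lambda = log( sigma^2 ), should agree with a direct dnorm computation
# up to an additive constant, n * log( 2 * pi ) / 2, that doesn't involve
# lambda; the names mu.check, n.check, lambda.hat.check and
# log.likelihood.check below are illustrative ones i'm introducing

mu.check <- mean( y )
n.check <- length( y )
lambda.hat.check <- log( sum( ( y - mu.check )^2 ) / n.check )

log.likelihood.check <- function( lambda ) {

  return( log.likelihood( lambda, lambda.hat.check, n.check ) -
    sum( dnorm( y, mu.check, exp( lambda / 2 ), log = TRUE ) ) )

}

print( c( log.likelihood.check( log( 30 ) ), log.likelihood.check( log( 50 ) ),
  n.check * log( 2 * pi ) / 2 ) )

# all three printed values should coincide (about 91.89 with n = 100)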
print( mu <- mean( y ) )

# [1] 404.59

print( sigma2.0 <- var( y ) )

# [1] 41.8201

M <- 100000
B <- 500
seed <- 614306

# let's start with a proposal distribution sd (PDSD) of sigma.star = 1
# and see what the acceptance rate is

sigma.star <- 1

mcmc.data.set.1 <- metropolis.example( y, mu, sigma.star, sigma2.0, M, B, seed )

# [1] 0.1769851

# only about 18% of the proposed moves were accepted

# discard the burn-in iterations:

mcmc.data.set.1 <- mcmc.data.set.1[ ( B + 1 ):( B + M ), ]

mean( mcmc.data.set.1[ , 2 ] )

# [1] 42.2204

sd( mcmc.data.set.1[ , 2 ] )

# [1] 6.210327

plot( 1:100, mcmc.data.set.1[ 1:100, 2 ], type = 'l', xlab = 'Iteration Number',
  ylab = 'Draws From p ( sigma^2 | y )',
  main = 'PDSD = 1.0, acceptance rate = 18%', lwd = 2, col = 'red' )

abline( h = mean( mcmc.data.set.1[ , 2 ] ), lwd = 2, col = 'darkcyan', lty = 2 )

# the acceptance rate with sigma.star = 1 is too low; it has led to
# an extremely 'blocky' time series for sigma^2: the chain makes
# relatively large moves *when* it moves, but it doesn't move
# as often as we would like it to

# here's a function that makes IID draws from the same target posterior,
# which (from the quiz 3 closed-form solution) is scaled inverse chi square
# with nu = n degrees of freedom and scale sigma2.hat

rsichi2 <- function( n, nu, s2, seed ) {

  set.seed( seed )

  return( nu * s2 / rchisq( n, nu ) )

}

n <- 100

print( sigma2.hat <- ( n - 1 ) * var( y ) / n )

# [1] 41.4019

direct.simulation <- rsichi2( 100000, 100, 41.4019, 8713106 )

# the exact posterior mean of sigma^2, namely nu * s2 / ( nu - 2 ),
# for comparison with the MCMC estimate above:

n * sigma2.hat / ( n - 2 )

# [1] 42.24684

qqplot( mcmc.data.set.1[ , 2 ], direct.simulation, xlab = 'MCMC Draws',
  ylab = 'Scaled Inverse Chi Square Draws' )

abline( 0, 1, lwd = 2, col = 'red' )

# you can see that, remarkably, even though the MCMC output
# is blocky, with lots of iterations where the markov chain
# stays in the same place, it is producing
# identically-distributed draws from the correct posterior

# now let's turn the metropolis sampler loose with a value
# of sigma.star that's too small:

sigma.star <- 0.05
seed <- 3201991

mcmc.data.set.2 <- metropolis.example( y, mu, sigma.star, sigma2.0, M, B, seed )

# [1] 0.8904577

# discard the burn-in iterations:

mcmc.data.set.2 <- mcmc.data.set.2[ ( B + 1 ):( B + M ), ]

plot( 1:100, mcmc.data.set.2[ 1:100, 2 ], type = 'l', xlab = 'Iteration Number',
  ylab = 'Draws From p ( sigma^2 | y )',
  main = 'PDSD = 0.05, acceptance rate = 89%', lwd = 2, col = 'red' )

abline( h = mean( mcmc.data.set.2[ , 2 ] ), lwd = 2, lty = 2, col = 'darkcyan' )

# now, with an 89% acceptance rate, the output is too 'sticky':
# the chain moves a lot, but mostly only makes small moves

# it's been shown that the optimal acceptance rate when k = 1
# is about 44%, which is achieved when the PDSD is about 2.4 times
# the posterior SD (a small function automating this rule of thumb
# appears just after the mcmc.data.set.3 run below)

# column 1 of the mcmc data set holds the draws of lambda = log( sigma^2 ),
# the scale on which the proposals are made:

2.4 * sd( mcmc.data.set.1[ , 1 ] )

# [1] 0.3481854

sigma.star <- 0.348
seed <- 8601815

mcmc.data.set.3 <- metropolis.example( y, mu, sigma.star, sigma2.0, M, B, seed )

# [1] 0.4348358

# discard the burn-in iterations:

mcmc.data.set.3 <- mcmc.data.set.3[ ( B + 1 ):( B + M ), ]

plot( 1:100, mcmc.data.set.3[ 1:100, 2 ], type = 'l', xlab = 'Iteration Number',
  ylab = 'Draws From p ( sigma^2 | y )',
  main = 'PDSD = 0.35, acceptance rate = 43%', lwd = 2, col = 'red' )

abline( h = mean( mcmc.data.set.3[ , 2 ] ), lwd = 2, lty = 2, col = 'darkcyan' )

# even the optimal metropolis sampler with the gaussian proposal
# distribution has some blocky-ness
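
# one possible way (my addition, not in the lecture notes) to automate
# the 2.4-times-the-posterior-SD rule of thumb used above: run a short
# pilot chain with any reasonable PDSD, estimate the posterior SD of
# lambda = log( sigma^2 ) from the pilot output, and rescale; the name
# tune.sigma.star and the pilot.* arguments are illustrative ones
# i'm introducing

tune.sigma.star <- function( y, mu, sigma2.0, pilot.M = 5000, pilot.B = 500,
  pilot.sigma.star = 1, seed = 1 ) {

  pilot <- metropolis.example( y, mu, pilot.sigma.star, sigma2.0,
    pilot.M, pilot.B, seed )

  # drop the pilot burn-in and rescale the SD of the lambda draws (column 1)

  pilot <- pilot[ ( pilot.B + 1 ):( pilot.B + pilot.M ), ]

  return( 2.4 * sd( pilot[ , 1 ] ) )

}

# for example, tune.sigma.star( y, mu, sigma2.0 ) should land reasonably
# close to the value 0.348 chosen above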
# let's summarize the posterior for sigma^2 using this mcmc output

sigma.squared.star <- mcmc.data.set.3[ , 2 ]
n.monitor <- M

# first we create what i call the MCMC 4-plot:

par( mfrow = c( 2, 2 ) )

plot( 1:n.monitor, sigma.squared.star, type = 'l', xlab = 'Iteration Number',
  ylab = 'sigma^2' )

# the upper left graph is a time-series plot of the monitored
# iterations (to visually check for stationarity)

plot( density( sigma.squared.star ), xlab = 'sigma^2', main = '',
  col = 'red', lwd = 2 )

# the upper right graph is a density trace of the marginal posterior
# distribution for the quantity being monitored

acf( as.ts( sigma.squared.star[ 1:n.monitor ], start = 1, end = n.monitor,
  frequency = 1 ), lag.max = 10, main = '' )

pacf( as.ts( sigma.squared.star[ 1:n.monitor ], start = 1, end = n.monitor,
  frequency = 1 ), lag.max = 10, main = '' )

par( mfrow = c( 1, 1 ) )

print( rho.1.hat.sigma.squared.star <- cor( sigma.squared.star[ 1:( n.monitor - 1 ) ],
  sigma.squared.star[ 2:n.monitor ] ) )

# [1] 0.6327683

# the lower left and right graphs are plots of the autocorrelation
# function (ACF) and the partial autocorrelation function (PACF)
# for the time series of monitored iterations; if these iterations
# behave like an autoregressive time series of order 1 with first-
# order autocorrelation rho (often denoted AR1( rho )), then
# (a) the PACF will have a single spike of height rho at lag 1 and
# no other significant spikes, and (b) the ACF will exhibit a
# geometric decay pattern, with a spike of height rho at lag 1, a
# spike of approximate height rho^2 at lag 2, and so on

# for the parameter sigma^2, the ACF and PACF plots look exactly
# like those of an AR1 with a first-order autocorrelation
# of about rho.1.hat = 0.63; even with the optimal PDSD,
# the autocorrelation of the MCMC output is fairly high

# an MCMC time series with a rho.1.hat value close to 0 (which
# would be equivalent to IID sampling) is said to be 'mixing well';
# here we would say that the sigma^2 series is mixing fairly well

# then you can make numerical summaries of the marginal
# posterior for the monitored quantity:

c( mean( sigma.squared.star ), sd( sigma.squared.star ),
  quantile( sigma.squared.star, probs = c( 0.0005, 0.9995 ) ) )

#                                 0.05%    99.95%
# 42.308414  6.089028 27.062679 68.864544

# we estimate that the posterior mean of sigma^2 is
# about 42.3, with a posterior SD of about 6.09; a 99.9%
# bayesian interval for sigma^2 runs from about
# 27.1 to about 68.9

# the reason for making the ACF and PACF plots is that if the time
# series of monitored iterations for a given quantity behaves like
# an AR1( rho ), then the Monte Carlo standard error (MCSE) of the
# mean of this series is given by the following expression:

print( se.sigma.squared.star.mean <- ( sd( sigma.squared.star ) /
  sqrt( n.monitor ) ) * sqrt( ( 1 + rho.1.hat.sigma.squared.star ) /
  ( 1 - rho.1.hat.sigma.squared.star ) ) )

# [1] 0.04060132

# this is also the right order of magnitude for the MCSE of the
# MCMC estimates of the posterior SD and of the quantiles giving the
# 99.9% interval for the monitored quantity
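
# a small convenience function (my addition, not in the lecture notes)
# that just packages the AR1-based MCSE formula above, so the same
# calculation can be reused for any monitored scalar quantity;
# the name mcse.ar1 is one i'm introducing here

mcse.ar1 <- function( theta.star ) {

  n.m <- length( theta.star )

  rho.1.hat <- cor( theta.star[ 1:( n.m - 1 ) ], theta.star[ 2:n.m ] )

  return( ( sd( theta.star ) / sqrt( n.m ) ) *
    sqrt( ( 1 + rho.1.hat ) / ( 1 - rho.1.hat ) ) )

}

# for example, mcse.ar1( sigma.squared.star ) reproduces the MCSE just
# computed, and mcse.ar1( sqrt( sigma.squared.star ) ) gives the
# corresponding MCSE for the posterior mean of sigma itself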
# another way to look at the ACF plot is to notice that after about
# 10 lags the autocorrelation is essentially 0

# this means that we could extract an essentially IID time series
# of output by taking every 10th value in sigma.squared.star;
# this is called 'thinning' the output:

sigma.squared.star.IID <- sigma.squared.star[ seq( 10, n.monitor, by = 10 ) ]

print( n.monitor.IID <- length( sigma.squared.star.IID ) )

# [1] 10000

# let's make an mcmc 4-plot of the thinned series:

par( mfrow = c( 2, 2 ) )

plot( 1:n.monitor.IID, sigma.squared.star.IID, type = 'l',
  xlab = 'Iteration Number', ylab = 'sigma^2' )

plot( density( sigma.squared.star.IID ), xlab = 'sigma^2', main = '',
  col = 'red', lwd = 2 )

acf( as.ts( sigma.squared.star.IID[ 1:n.monitor.IID ], start = 1,
  end = n.monitor.IID, frequency = 1 ), lag.max = 10, main = '' )

pacf( as.ts( sigma.squared.star.IID[ 1:n.monitor.IID ], start = 1,
  end = n.monitor.IID, frequency = 1 ), lag.max = 10, main = '' )

par( mfrow = c( 1, 1 ) )

# white noise! but now there are only 10,000 IID observations, having
# essentially the same information content as the 100,000 autocorrelated
# observations they came from; thinning doesn't add any extra information
# about sigma^2 (how could it?); a quick numerical check of the
# white-noise claim follows the closing banner below

############################################################################
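
# postscript (my addition, not in the lecture notes): a quick numerical
# check of the white-noise claim above; if the original series really is
# close to AR1( rho.1.hat ), the lag-1 autocorrelation of the thinned
# series should be roughly rho.1.hat^10, which is essentially 0

print( cor( sigma.squared.star.IID[ 1:( n.monitor.IID - 1 ) ],
  sigma.squared.star.IID[ 2:n.monitor.IID ] ) )

print( rho.1.hat.sigma.squared.star^10 )

# both printed values should be close to 0 (the second is about 0.01)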