############################################################################

# simple example of metropolis sampling in a case study with k = 1,
# namely the problem whose closed-form conjugate solution you found
# in quiz 3:

#   p ( sigma^2 ) proportional to 1 / sigma^2

#   ( Y_i | sigma^2 G B ) ~ IID N( mu, sigma^2 )   ( i = 1, ..., n )

# this is based on pages 31-55 in part 8 of the lecture notes

# unbuffer the output

# change directory to an appropriate place
# on my home desktop this is

# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2020' )

# here's a driver function for the metropolis sampling:

metropolis.example <- function( y, mu, sigma.star, sigma2.0, M, B, seed ) {

  set.seed( seed )

  n <- length( y )

  # with mu known, sigma2.hat is the mle of sigma^2; lambda.hat is its log

  sigma2.hat <- sum( ( y - mu )^2 ) / n
  lambda.hat <- log( sigma2.hat )

  # column 1 stores lambda = log( sigma^2 ), column 2 stores sigma^2

  mcmc.data.set <- matrix( NA, M + B + 1, 2 )

  mcmc.data.set[ 1, ] <- c( log( sigma2.0 ), sigma2.0 )

  acceptance.count <- 0

  for ( i in 2:( M + B + 1 ) ) {

    # gaussian random-walk proposal on the lambda scale

    lambda.current <- mcmc.data.set[ i - 1, 1 ]
    lambda.star <- rnorm( 1, lambda.current, sigma.star )

    u <- runif( 1 )

    if ( u <= acceptance.probability( lambda.star, lambda.current,
      lambda.hat, n ) ) {

      mcmc.data.set[ i, 1 ] <- lambda.star
      mcmc.data.set[ i, 2 ] <- exp( lambda.star )

      acceptance.count <- acceptance.count + 1

    } else {

      mcmc.data.set[ i, 1 ] <- lambda.current
      mcmc.data.set[ i, 2 ] <- exp( lambda.current )

    }

    if ( ( i %% 10000 ) == 0 ) print( i )

  }

  print( acceptance.rate <- acceptance.count / ( M + B ) )

  return( mcmc.data.set )

}

# now the acceptance probability function:

acceptance.probability <- function( lambda.star, lambda.current, lambda.hat, n ) {

  return( exp( log.posterior( lambda.star, lambda.hat, n ) -
    log.posterior( lambda.current, lambda.hat, n ) ) )

}

# and the computation of the log posterior:

log.posterior <- function( lambda, lambda.hat, n ) {

  return( log.prior( lambda ) + log.likelihood( lambda, lambda.hat, n ) )

}

# which in turn depends on the log likelihood:

log.likelihood <- function( lambda, lambda.hat, n ) {

  # log likelihood for lambda = log( sigma^2 ), up to an additive constant

  return( - lambda * n / 2 - n * exp( lambda.hat ) / ( 2 * exp( lambda ) ) )

}

# and the log prior:

log.prior <- function( lambda ) {

  # the prior p( sigma^2 ) proportional to 1 / sigma^2 transforms to a flat
  # (constant) prior on lambda = log( sigma^2 ), so its log is 0

  return( 0 )

}

# define the NB10 data set

y <- c( 409., 400., 406., 399., 402., 406., 401., 403., 401., 403.,
  398., 403., 407., 402., 401., 399., 400., 401., 405., 402.,
  408., 399., 399., 402., 399., 397., 407., 401., 399., 401.,
  403., 400., 410., 401., 407., 423., 406., 406., 402., 405.,
  405., 409., 399., 402., 407., 406., 413., 409., 404., 402.,
  404., 406., 407., 405., 411., 410., 410., 410., 401., 402.,
  404., 405., 392., 407., 406., 404., 403., 408., 404., 407.,
  412., 406., 409., 400., 408., 404., 401., 404., 408., 406.,
  408., 406., 401., 412., 393., 437., 418., 415., 404., 401.,
  401., 407., 412., 375., 409., 406., 398., 406., 403., 404. )
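
# a quick optional sanity check (my addition, not in the lecture notes):
# the log.likelihood function above, written in terms of
# lambda = log( sigma^2 ), should agree with a direct dnorm computation
# up to an additive constant, n * log( 2 * pi ) / 2, that doesn't involve
# lambda; the names mu.check, n.check, lambda.hat.check and
# log.likelihood.check below are illustrative ones i'm introducing

mu.check <- mean( y )
n.check <- length( y )
lambda.hat.check <- log( sum( ( y - mu.check )^2 ) / n.check )

log.likelihood.check <- function( lambda ) {

  return( log.likelihood( lambda, lambda.hat.check, n.check ) -
    sum( dnorm( y, mu.check, exp( lambda / 2 ), log = TRUE ) ) )

}

print( c( log.likelihood.check( log( 30 ) ), log.likelihood.check( log( 50 ) ),
  n.check * log( 2 * pi ) / 2 ) )

# all three printed values should coincide (about 91.89 with n = 100)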
print( mu <- mean( y ) )

# [1] 404.59

print( sigma2.0 <- var( y ) )

# [1] 41.8201

M <- 100000
B <- 500
seed <- 614306

# let's start with a proposal distribution sd (PDSD) of sigma.star = 1
# and see what the acceptance rate is

sigma.star <- 1

mcmc.data.set.1 <- metropolis.example( y, mu, sigma.star, sigma2.0, M, B, seed )

# [1] 0.1769851

# only about 18% of the proposed moves were accepted

# discard the burn-in iterations:

mcmc.data.set.1 <- mcmc.data.set.1[ ( B + 1 ):( B + M ), ]

mean( mcmc.data.set.1[ , 2 ] )

# [1] 42.2204

sd( mcmc.data.set.1[ , 2 ] )

# [1] 6.210327

plot( 1:100, mcmc.data.set.1[ 1:100, 2 ], type = 'l', xlab = 'Iteration Number',
  ylab = 'Draws From p ( sigma^2 | y )',
  main = 'PDSD = 1.0, acceptance rate = 18%', lwd = 2, col = 'red' )

abline( h = mean( mcmc.data.set.1[ , 2 ] ), lwd = 2, col = 'darkcyan', lty = 2 )

# the acceptance rate with sigma.star = 1 is too low; it has led to
# an extremely 'blocky' time series for sigma^2: the chain makes
# relatively large moves *when* it moves, but it doesn't move
# as often as we would like it to

# here's a function that makes IID draws from the same target posterior,
# which (from the quiz 3 closed-form solution) is scaled inverse chi square
# with nu = n degrees of freedom and scale sigma2.hat

rsichi2 <- function( n, nu, s2, seed ) {

  set.seed( seed )

  return( nu * s2 / rchisq( n, nu ) )

}

n <- 100

print( sigma2.hat <- ( n - 1 ) * var( y ) / n )

# [1] 41.4019

direct.simulation <- rsichi2( 100000, 100, 41.4019, 8713106 )

# the exact posterior mean of sigma^2, namely nu * s2 / ( nu - 2 ),
# for comparison with the MCMC estimate above:

n * sigma2.hat / ( n - 2 )

# [1] 42.24684

qqplot( mcmc.data.set.1[ , 2 ], direct.simulation, xlab = 'MCMC Draws',
  ylab = 'Scaled Inverse Chi Square Draws' )

abline( 0, 1, lwd = 2, col = 'red' )

# you can see that, remarkably, even though the MCMC output
# is blocky, with lots of iterations where the markov chain
# stays in the same place, it is producing
# identically-distributed draws from the correct posterior

# now let's turn the metropolis sampler loose with a value
# of sigma.star that's too small:

sigma.star <- 0.05
seed <- 3201991

mcmc.data.set.2 <- metropolis.example( y, mu, sigma.star, sigma2.0, M, B, seed )

# [1] 0.8904577

# discard the burn-in iterations:

mcmc.data.set.2 <- mcmc.data.set.2[ ( B + 1 ):( B + M ), ]

plot( 1:100, mcmc.data.set.2[ 1:100, 2 ], type = 'l', xlab = 'Iteration Number',
  ylab = 'Draws From p ( sigma^2 | y )',
  main = 'PDSD = 0.05, acceptance rate = 89%', lwd = 2, col = 'red' )

abline( h = mean( mcmc.data.set.2[ , 2 ] ), lwd = 2, lty = 2, col = 'darkcyan' )

# now, with an 89% acceptance rate, the output is too 'sticky':
# the chain moves a lot, but mostly only makes small moves

# it's been shown that the optimal acceptance rate when k = 1
# is about 44%, which is achieved when the PDSD is about 2.4 times
# the posterior SD (a small function automating this rule of thumb
# appears just after the mcmc.data.set.3 run below)

# column 1 of the mcmc data set holds the draws of lambda = log( sigma^2 ),
# the scale on which the proposals are made:

2.4 * sd( mcmc.data.set.1[ , 1 ] )

# [1] 0.3481854

sigma.star <- 0.348
seed <- 8601815

mcmc.data.set.3 <- metropolis.example( y, mu, sigma.star, sigma2.0, M, B, seed )

# [1] 0.4348358

# discard the burn-in iterations:

mcmc.data.set.3 <- mcmc.data.set.3[ ( B + 1 ):( B + M ), ]

plot( 1:100, mcmc.data.set.3[ 1:100, 2 ], type = 'l', xlab = 'Iteration Number',
  ylab = 'Draws From p ( sigma^2 | y )',
  main = 'PDSD = 0.35, acceptance rate = 43%', lwd = 2, col = 'red' )

abline( h = mean( mcmc.data.set.3[ , 2 ] ), lwd = 2, lty = 2, col = 'darkcyan' )

# even the optimal metropolis sampler with the gaussian proposal
# distribution has some blocky-ness
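
# one possible way (my addition, not in the lecture notes) to automate
# the 2.4-times-the-posterior-SD rule of thumb used above: run a short
# pilot chain with any reasonable PDSD, estimate the posterior SD of
# lambda = log( sigma^2 ) from the pilot output, and rescale; the name
# tune.sigma.star and the pilot.* arguments are illustrative ones
# i'm introducing

tune.sigma.star <- function( y, mu, sigma2.0, pilot.M = 5000, pilot.B = 500,
  pilot.sigma.star = 1, seed = 1 ) {

  pilot <- metropolis.example( y, mu, pilot.sigma.star, sigma2.0,
    pilot.M, pilot.B, seed )

  # drop the pilot burn-in and rescale the SD of the lambda draws (column 1)

  pilot <- pilot[ ( pilot.B + 1 ):( pilot.B + pilot.M ), ]

  return( 2.4 * sd( pilot[ , 1 ] ) )

}

# for example, tune.sigma.star( y, mu, sigma2.0 ) should land reasonably
# close to the value 0.348 chosen above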
# let's summarize the posterior for sigma^2 using this mcmc output

sigma.squared.star <- mcmc.data.set.3[ , 2 ]
n.monitor <- M

# first we create what i call the MCMC 4-plot:

par( mfrow = c( 2, 2 ) )

plot( 1:n.monitor, sigma.squared.star, type = 'l', xlab = 'Iteration Number',
  ylab = 'sigma^2' )

# the upper left graph is a time-series plot of the monitored
# iterations (to visually check for stationarity)

plot( density( sigma.squared.star ), xlab = 'sigma^2', main = '',
  col = 'red', lwd = 2 )

# the upper right graph is a density trace of the marginal posterior
# distribution for the quantity being monitored

acf( as.ts( sigma.squared.star[ 1:n.monitor ], start = 1, end = n.monitor,
  frequency = 1 ), lag.max = 10, main = '' )

pacf( as.ts( sigma.squared.star[ 1:n.monitor ], start = 1, end = n.monitor,
  frequency = 1 ), lag.max = 10, main = '' )

par( mfrow = c( 1, 1 ) )

print( rho.1.hat.sigma.squared.star <- cor( sigma.squared.star[ 1:( n.monitor - 1 ) ],
  sigma.squared.star[ 2:n.monitor ] ) )

# [1] 0.6327683

# the lower left and right graphs are plots of the autocorrelation
# function (ACF) and the partial autocorrelation function (PACF)
# for the time series of monitored iterations; if these iterations
# behave like an autoregressive time series of order 1 with first-
# order autocorrelation rho (often denoted AR1( rho )), then
# (a) the PACF will have a single spike of height rho at lag 1 and
# no other significant spikes, and (b) the ACF will exhibit a
# geometric decay pattern, with a spike of height rho at lag 1, a
# spike of approximate height rho^2 at lag 2, and so on

# for the parameter sigma^2, the ACF and PACF plots look exactly
# like those of an AR1 with a first-order autocorrelation
# of about rho.1.hat = 0.63; even with the optimal PDSD,
# the autocorrelation of the MCMC output is fairly high

# an MCMC time series with a rho.1.hat value close to 0 (which
# would be equivalent to IID sampling) is said to be 'mixing well';
# here we would say that the sigma^2 series is mixing fairly well

# then you can make numerical summaries of the marginal
# posterior for the monitored quantity:

c( mean( sigma.squared.star ), sd( sigma.squared.star ),
  quantile( sigma.squared.star, probs = c( 0.0005, 0.9995 ) ) )

#                                 0.05%    99.95%
# 42.308414  6.089028 27.062679 68.864544

# we estimate that the posterior mean of sigma^2 is
# about 42.3, with a posterior SD of about 6.09; a 99.9%
# bayesian interval for sigma^2 runs from about
# 27.1 to about 68.9

# the reason for making the ACF and PACF plots is that if the time
# series of monitored iterations for a given quantity behaves like
# an AR1( rho ), then the Monte Carlo standard error (MCSE) of the
# mean of this series is given by the following expression:

print( se.sigma.squared.star.mean <- ( sd( sigma.squared.star ) /
  sqrt( n.monitor ) ) * sqrt( ( 1 + rho.1.hat.sigma.squared.star ) /
  ( 1 - rho.1.hat.sigma.squared.star ) ) )

# [1] 0.04060132

# this is also the right order of magnitude for the MCSE of the
# MCMC estimates of the posterior SD and of the quantiles giving the
# 99.9% interval for the monitored quantity
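
# a small convenience function (my addition, not in the lecture notes)
# that just packages the AR1-based MCSE formula above, so the same
# calculation can be reused for any monitored scalar quantity;
# the name mcse.ar1 is one i'm introducing here

mcse.ar1 <- function( theta.star ) {

  n.m <- length( theta.star )

  rho.1.hat <- cor( theta.star[ 1:( n.m - 1 ) ], theta.star[ 2:n.m ] )

  return( ( sd( theta.star ) / sqrt( n.m ) ) *
    sqrt( ( 1 + rho.1.hat ) / ( 1 - rho.1.hat ) ) )

}

# for example, mcse.ar1( sigma.squared.star ) reproduces the MCSE just
# computed, and mcse.ar1( sqrt( sigma.squared.star ) ) gives the
# corresponding MCSE for the posterior mean of sigma itself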
# another way to look at the ACF plot is to notice that after about
# 10 lags the autocorrelation is essentially 0

# this means that we could extract an essentially IID time series
# of output by taking every 10th value in sigma.squared.star;
# this is called 'thinning' the output:

sigma.squared.star.IID <- sigma.squared.star[ seq( 10, n.monitor, by = 10 ) ]

print( n.monitor.IID <- length( sigma.squared.star.IID ) )

# [1] 10000

# let's make an mcmc 4-plot of the thinned series:

par( mfrow = c( 2, 2 ) )

plot( 1:n.monitor.IID, sigma.squared.star.IID, type = 'l',
  xlab = 'Iteration Number', ylab = 'sigma^2' )

plot( density( sigma.squared.star.IID ), xlab = 'sigma^2', main = '',
  col = 'red', lwd = 2 )

acf( as.ts( sigma.squared.star.IID[ 1:n.monitor.IID ], start = 1,
  end = n.monitor.IID, frequency = 1 ), lag.max = 10, main = '' )

pacf( as.ts( sigma.squared.star.IID[ 1:n.monitor.IID ], start = 1,
  end = n.monitor.IID, frequency = 1 ), lag.max = 10, main = '' )

par( mfrow = c( 1, 1 ) )

# white noise! but now there are only 10,000 IID observations, having
# essentially the same information content as the 100,000 autocorrelated
# observations they came from; thinning doesn't add any extra information
# about sigma^2 (how could it?); a quick numerical check of the
# white-noise claim follows the closing banner below

############################################################################
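
# postscript (my addition, not in the lecture notes): a quick numerical
# check of the white-noise claim above; if the original series really is
# close to AR1( rho.1.hat ), the lag-1 autocorrelation of the thinned
# series should be roughly rho.1.hat^10, which is essentially 0

print( cor( sigma.squared.star.IID[ 1:( n.monitor.IID - 1 ) ],
  sigma.squared.star.IID[ 2:n.monitor.IID ] ) )

print( rho.1.hat.sigma.squared.star^10 )

# both printed values should be close to 0 (the second is about 0.01)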