##########################################################################

# a simple example of monte-carlo (MC) IID sampling
# to learn about a distribution (in this case, a member of
# the Beta family)

# unbuffer the output

# set the correct working directory

##########################################################################

# this is from part 9 of the lecture notes, pages 6-12

beta.sim <- function( n.monitor, alpha, beta, n.sim, seed ) {

  set.seed( seed )

  theta.out <- matrix( 0, n.sim, 2 )

  for ( i in 1:n.sim ) {

    theta.sample <- rbeta( n.monitor, alpha, beta )

    theta.out[ i, 1 ] <- mean( theta.sample )
    theta.out[ i, 2 ] <- sqrt( var( theta.sample ) / n.monitor )

  }

  return( theta.out )

}

# n.monitor is the number of simulated 'monitoring' draws
# going into each posterior mean estimate

n.monitor <- 100

# alpha and beta are the posterior Beta parameters

alpha <- 76.5
beta <- 353.5

print( posterior.mean <- alpha / ( alpha + beta ) )

# [1] 0.177907

# n.sim is the number of simulation replications of the process
# of creating n.monitor IID random draws from the Beta( alpha, beta )
# distribution, taking their mean, and computing the
# (monte carlo) standard error of that mean estimate

n.sim <- 10000

# seed is the initial random number seed

seed <- 6425451

system.time( theta.out <- beta.sim( n.monitor, alpha, beta, n.sim, seed ) )

#    user  system elapsed
#    0.45    0.02    0.47

# on my desktop at home this took less than half a second

theta.out[ 1:5, ]

#      monte carlo
#      estimate of    monte carlo
#      posterior      standard error
#      mean           of mean estimate

#           [,1]        [,2]
# [1,] 0.1756400 0.001854220
# [2,] 0.1764806 0.001703780
# [3,] 0.1781742 0.001979863
# [4,] 0.1793588 0.002038532
# [5,] 0.1781556 0.001596011

# [1] 0.177907 is the right answer

# let's make a *time series plot* of the first few
# of the n.sim replications:

n.iterations <- 100

plot( 1:n.iterations, theta.out[ 1:n.iterations, 1 ], type = 'l',
  xlab = 'iteration number', ylab = 'simulated mean',
  main = 'IID Monte Carlo Sampling of Posterior Mean', lwd = 2, col = 'red' )

abline( h = 0.1779, lwd = 2, lty = 2, col = 'black' )

# this is what 'white noise' (IID draws) looks like

###################################################################################

# let's see how many of the n.sim replications
# led to samples with 95% interval estimates
# that include the truth

# (alpha.target.hit.rate is the nominal *miss* rate:
# 1 - 0.95 = 0.05)

alpha.target.hit.rate <- 0.05

hit <- 1 * ( ( theta.out[ , 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
  theta.out[ , 2 ] < posterior.mean ) &
  ( posterior.mean < theta.out[ , 1 ] +
  qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ , 2 ] ) )

hit[ 1:100 ]

#  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [48] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# [95] 1 1 1 1 1 0

# in the first 100 simulation replications,
# with the random number seed i'm using,
# we got a hit rate of 99%

# across all 10,000 replications the hit rate was:

mean( hit )

# [1] 0.9469
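# (an addition, not in the original notes) the observed hit rate is
# itself a mean of n.sim = 10000 bernoulli( 0.95 ) trials, so it has
# its own simulation standard error of sqrt( 0.95 * 0.05 / n.sim );
# here's a quick sketch checking that 0.9469 is consistent with the
# nominal 95% coverage:

se.hit.rate <- sqrt( 0.95 * ( 1 - 0.95 ) / n.sim )

print( se.hit.rate )

# this is about 0.00218, so the standardized discrepancy is

( mean( hit ) - 0.95 ) / se.hit.rate

# about -1.4 with the seed above: well within 2 standard errors
# of 0, i.e., no evidence of interval mis-calibration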
###################################################################################

# here's a visualization of the first 10 of the intervals,
# with 'hit' intervals in red and 'miss' intervals in darkcyan

n.iterations <- 10

plot( 1:n.iterations, theta.out[ 1:n.iterations, 1 ], type = 'l',
  xlab = 'iteration number', ylab = 'simulated mean',
  main = 'IID Monte Carlo Sampling of Posterior Mean', lwd = 2, col = 'red',
  ylim = c( 0.165, 0.19 ) )

abline( h = posterior.mean, lwd = 2, col = 'black' )

for ( i in 1:n.iterations ) {

  if ( hit[ i ] == 1 ) {

    segments( i, theta.out[ i, 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
      theta.out[ i, 2 ], i, theta.out[ i, 1 ] +
      qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ i, 2 ],
      lwd = 1, col = 'red', lty = 2 )

  } else {

    segments( i, theta.out[ i, 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
      theta.out[ i, 2 ], i, theta.out[ i, 1 ] +
      qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ i, 2 ],
      lwd = 1, col = 'darkcyan', lty = 2 )

  }

}

#####################################################################################

# now the first 20:

n.iterations <- 20

plot( 1:n.iterations, theta.out[ 1:n.iterations, 1 ], type = 'l',
  xlab = 'iteration number', ylab = 'simulated mean',
  main = 'IID Monte Carlo Sampling of Posterior Mean', lwd = 2, col = 'red',
  ylim = c( 0.17, 0.185 ) )

abline( h = posterior.mean, lwd = 2, col = 'black' )

for ( i in 1:n.iterations ) {

  if ( hit[ i ] == 1 ) {

    segments( i, theta.out[ i, 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
      theta.out[ i, 2 ], i, theta.out[ i, 1 ] +
      qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ i, 2 ],
      lwd = 1, col = 'red', lty = 2 )

  } else {

    segments( i, theta.out[ i, 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
      theta.out[ i, 2 ], i, theta.out[ i, 1 ] +
      qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ i, 2 ],
      lwd = 1, col = 'darkcyan', lty = 2 )

  }

}

########################################################################################

# and the first 100:

n.iterations <- 100

plot( 1:n.iterations, theta.out[ 1:n.iterations, 1 ], type = 'l',
  xlab = 'iteration number', ylab = 'simulated mean',
  main = 'IID Monte Carlo Sampling of Posterior Mean', lwd = 2, col = 'red',
  ylim = c( 0.17, 0.185 ) )

abline( h = posterior.mean, lwd = 2, col = 'black' )

for ( i in 1:n.iterations ) {

  if ( hit[ i ] == 1 ) {

    segments( i, theta.out[ i, 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
      theta.out[ i, 2 ], i, theta.out[ i, 1 ] +
      qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ i, 2 ],
      lwd = 1, col = 'red', lty = 2 )

  } else {

    segments( i, theta.out[ i, 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
      theta.out[ i, 2 ], i, theta.out[ i, 1 ] +
      qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ i, 2 ],
      lwd = 2, col = 'darkcyan', lty = 2 )

  }

}

############################################################################################

# and finally the first 1000:

n.iterations <- 1000

plot( 1:n.iterations, theta.out[ 1:n.iterations, 1 ], type = 'l',
  xlab = 'iteration number', ylab = 'simulated mean',
  main = 'IID Monte Carlo Sampling of Posterior Mean', lwd = 2, col = 'red',
  ylim = c( 0.165, 0.19 ) )

abline( h = posterior.mean, lwd = 2, col = 'black' )

for ( i in 1:n.iterations ) {

  if ( hit[ i ] == 1 ) {

    segments( i, theta.out[ i, 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
      theta.out[ i, 2 ], i, theta.out[ i, 1 ] +
      qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ i, 2 ],
      lwd = 1, col = 'red', lty = 2 )

  } else {

    segments( i, theta.out[ i, 1 ] - qnorm( 1 - alpha.target.hit.rate / 2 ) *
      theta.out[ i, 2 ], i, theta.out[ i, 1 ] +
      qnorm( 1 - alpha.target.hit.rate / 2 ) * theta.out[ i, 2 ],
      lwd = 2, col = 'darkcyan', lty = 2 )

  }

}
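# (an addition, not in the original notes) the four plotting blocks
# above differ only in n.iterations, ylim, and the line width used
# for the 'miss' intervals; here's a sketch of a helper that
# consolidates them (the function name interval.plot is my own
# invention):

interval.plot <- function( theta.out, hit, posterior.mean, n.iterations,
  ylim, miss.lwd = 1, alpha.target.hit.rate = 0.05 ) {

  z <- qnorm( 1 - alpha.target.hit.rate / 2 )

  plot( 1:n.iterations, theta.out[ 1:n.iterations, 1 ], type = 'l',
    xlab = 'iteration number', ylab = 'simulated mean',
    main = 'IID Monte Carlo Sampling of Posterior Mean', lwd = 2,
    col = 'red', ylim = ylim )

  abline( h = posterior.mean, lwd = 2, col = 'black' )

  for ( i in 1:n.iterations ) {

    # draw each interval in red if it covers the truth,
    # darkcyan (with width miss.lwd) if it misses

    segments( i, theta.out[ i, 1 ] - z * theta.out[ i, 2 ],
      i, theta.out[ i, 1 ] + z * theta.out[ i, 2 ],
      lwd = ifelse( hit[ i ] == 1, 1, miss.lwd ),
      col = ifelse( hit[ i ] == 1, 'red', 'darkcyan' ), lty = 2 )

  }

}

# for example, the first-1000 picture above is equivalent to

# interval.plot( theta.out, hit, posterior.mean, 1000,
#   ylim = c( 0.165, 0.19 ), miss.lwd = 2 )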
#########################################################################

# in day-to-day work you would typically only take a single sample
# from the distribution of interest to you, or you might take
# a 'pilot' sample to get an initial idea of accuracy,
# from which you could compute the required sample size
# to achieve a reasonable monte carlo accuracy goal

# let's start with a sample of size 500
# (this is unrealistically low, for illustration purposes)

n.monitor.pilot <- 500

theta.star.pilot <- rbeta( n.monitor.pilot, alpha, beta )

print( mean.pilot <- mean( theta.star.pilot ) )

# [1] 0.1793063

print( sd.pilot <- sd( theta.star.pilot ) )

# [1] 0.01880652

# the pilot monte carlo standard error (MCSE) of
# the mean estimate (by the usual frequentist calculation)
# is sd.pilot / sqrt( n.monitor.pilot )

print( mcse.mean.pilot <- sd.pilot / sqrt( n.monitor.pilot ) )

# [1] 0.0008410533

# suppose that we want to be able to report
# the monte carlo mean estimate to three significant figures
# (at the moment that would be 0.179)

# then we could conservatively aim for an MCSE of epsilon = 0.0001
# (1 tick in the fourth significant figure)

# solving the little equation

#   epsilon = sd / sqrt( n )

# for n gives

#   n = ( sd / epsilon )^2

epsilon <- 0.0001

print( n.target <- ( sd.pilot / epsilon )^2 )

# [1] 35368.54

# this suggests that our actual monitoring run
# should be of length roughly 35,000,
# which i might round up to 40,000 for good luck

n.monitor <- 40000

theta.star <- rbeta( n.monitor, alpha, beta )

print( mc.mean.estimate <- mean( theta.star ) )

# [1] 0.1778335

print( mc.sd.estimate <- sd( theta.star ) )

# [1] 0.01845791

print( mcse.mean.estimate <- mc.sd.estimate / sqrt( n.monitor ) )

# [1] 9.228955e-05

# so, on the basis of this monitoring run,
# we would report a monte carlo posterior mean estimate
# of 0.178, or even 0.1778, with a give-or-take
# (monte carlo standard error) of about 0.0001; the right answer is

posterior.mean

# [1] 0.177907
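# (an addition, not in the original notes) the pilot-to-final-run
# arithmetic above is worth wrapping up in one place; here's a sketch
# of a small helper (the name mc.sample.size is my own invention)
# that takes a pilot sample and an MCSE target epsilon and returns
# the suggested length of the full monitoring run:

mc.sample.size <- function( theta.star.pilot, epsilon ) {

  # n solves epsilon = sd / sqrt( n ), i.e., n = ( sd / epsilon )^2;
  # round up, since more draws can only help

  ceiling( ( sd( theta.star.pilot ) / epsilon )^2 )

}

mc.sample.size( theta.star.pilot, epsilon = 0.0001 )

# with the pilot sample above this comes out to roughly 35,000,
# matching the n.target calculation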
#####################################################################################

# these days, simulation is so fast that many people
# just start with a pilot sample size of 100,000

n.monitor <- 100000

c( n.monitor, alpha, beta )

# [1] 100000.0     76.5    353.5

theta.star <- rbeta( n.monitor, alpha, beta )

# the following picture, which could be called a 'monte carlo 4-plot',
# summarizes the n.monitor draws from the target distribution

par( mfrow = c( 2, 2 ) )

plot( 1:n.monitor, theta.star, type = 'l', xlab = 'Iteration Number',
  ylab = 'theta' )

# the upper left graph is a time-series plot of the monitored
# iterations (to visually check for stationarity)

plot( density( theta.star ), xlab = 'theta', main = '', lwd = 2, col = 'red' )

# the upper right graph is a density trace of the
# quantity being monitored

acf( as.ts( theta.star ), lag.max = 10, main = '' )

# the lower left graph is a plot of the autocorrelation function (ACF)
# of the monitored draws from the target distribution
# (this will be explained in the discussion section)

pacf( as.ts( theta.star ), lag.max = 10, main = '' )

# and the lower right graph is a plot of the
# partial autocorrelation function (PACF)
# of the monitored draws from the target distribution
# (this will be explained in the discussion section)

par( mfrow = c( 1, 1 ) )

# the ACF for an IID time series has a spike at lag 0
# of height 1, and all other spikes are negligibly different from 0

# the PACF for an IID time series has all of its spikes
# negligibly different from 0

# these behaviors match our simulation findings

# *markov chain* monte carlo results will be different (more soon)

################################################################################################

# code simulating a random walk (lecture notes part 9 page 26)

rw.sim <- function( k, p, theta.start, n.sim, seed ) {

  set.seed( seed )

  theta <- rep( 0, n.sim + 1 )

  theta[ 1 ] <- theta.start

  for ( i in 1:n.sim ) {

    theta[ i + 1 ] <- move( k, p, theta[ i ] )

  }

  print( table( theta ) )

  return( theta )

}

move <- function( k, p, theta ) {

  repeat {

    increment <- sample( x = c( -1, 0, 1 ), size = 1, prob = p )

    theta.next <- theta + increment

    # accept the proposed move if it stays inside [ -k, k ];
    # otherwise go around the repeat loop and resample

    if ( abs( theta.next ) <= k ) return( theta.next )

  }

}

p <- c( 1, 1, 1 ) / 3

k <- 5

theta.start <- 0

seed <- 9626954

theta.star <- rw.sim( k, p, theta.start, 10, seed )

# theta
# -2 -1  0
#  3  4  4

theta.star <- rw.sim( k, p, theta.start, 100, seed )

# theta
# -5 -4 -3 -2 -1  0  1  2  3
# 15 22 18 11 11  8  2  9  5

theta.star <- rw.sim( k, p, theta.start, 1000, seed )

# theta
#  -5  -4  -3  -2  -1   0   1   2   3   4   5
# 100 113  87  65  66  68  83  95 130 124  70

theta.star <- rw.sim( k, p, theta.start, 100000, seed )

# theta
#    -5    -4    -3    -2    -1     0     1     2     3     4     5
#  6663 10111 10201  9864  9841  9587  9380  9305  9535  9337  6177

n.monitor <- 100001

par( mfrow = c( 2, 2 ) )

plot( 1:n.monitor, theta.star, type = 'l', xlab = 'Iteration Number',
  ylab = 'theta' )

# the upper left graph is a time-series plot of the monitored
# iterations (to visually check for stationarity)

plot( density( theta.star ), xlab = 'theta', main = '', lwd = 2, col = 'red' )

# the upper right graph is a density trace of the
# quantity being monitored

acf( as.ts( theta.star ), lag.max = 10, main = '' )

# the lower left graph is a plot of the autocorrelation function (ACF)
# of the monitored draws from the target distribution
# (this will be explained in the discussion section)

pacf( as.ts( theta.star ), lag.max = 10, main = '' )

# and the lower right graph is a plot of the
# partial autocorrelation function (PACF)
# of the monitored draws from the target distribution
# (this will be explained in the discussion section)

par( mfrow = c( 1, 1 ) )

########################################################################################
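# (an addition, not in the original notes) as a sanity check on the
# table of 100,000 monitored draws above, the stationary distribution
# of this random walk can be computed numerically: interior states
# move down/stay/up with probability 1/3 each, and at the boundaries
# move( ) resamples the rejected proposal, so the two remaining moves
# get probability 1/2 each; here's a sketch (the function name
# rw.stationary is my own invention):

rw.stationary <- function( k ) {

  # states are -k, ..., k, indexed 1, ..., m

  m <- 2 * k + 1

  P <- matrix( 0, m, m )

  for ( j in 1:m ) {

    # candidate destinations: one step down, stay, one step up

    destinations <- c( j - 1, j, j + 1 )

    valid <- destinations[ destinations >= 1 & destinations <= m ]

    # move( ) resamples until the proposal is legal, so the
    # transition probability is uniform on the valid destinations

    P[ j, valid ] <- 1 / length( valid )

  }

  # the stationary distribution is the normalized left eigenvector
  # of P with eigenvalue 1, i.e., the leading eigenvector of t( P )

  pi.raw <- Re( eigen( t( P ) )$vectors[ , 1 ] )

  return( pi.raw / sum( pi.raw ) )

}

round( rw.stationary( 5 ), 4 )

# this gives probability 3 / 31 = 0.0968 for each of the 9 interior
# states and 2 / 31 = 0.0645 for the 2 boundary states, in good
# agreement with the long-run frequencies in the table above
# (e.g., 9587 / 100001 = 0.0959 at 0 and 6663 / 100001 = 0.0666 at -5)

########################################################################################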