##########################################################################

# likelihood and bayesian analysis in R to help you with
# take-home test 1 problem 2(B)

# unbuffer the output

# set the right working directory
# on the desktop machines installed in the baskin auditorium this is

# setwd( 'C:/Users/user/Desktop/STAT-206-Winter-2020' )

y <- c( 495, 541, 1461, 1555, 1603, 2201, 2750, 3468, 3516, 4319,
  6622, 7728, 13159, 21194 )

# from the context of the problem, y is non-negative and can
# only take on integer values, but the observed range is from
# around 0 to more than 21,000, and the gaps between the
# possible values (only 1 in size) are so small in relation
# to the spread of the data that it's vastly preferable
# to model our uncertainty about y (before the data arrived)
# with a continuous probability density function (PDF)

# we still use the cheating approach to
# sampling distribution specification at this stage
# of the course, because we haven't looked yet at better options

# make a histogram to look at the distributional shape

hist( y, prob = T, main = 'Wire Stress Case Study' )

# we want to model y with a continuous density,
# because putting a discrete distribution on it
# would result in about 20,000 distinct possible values,
# which is silly; and besides, the gaps of size 1
# between the possible values of y are tiny in relation
# to typical y values (e.g., the median is about 3,000)

# the histogram looks roughly like a member of
# the family of Exponential distributions

print( n <- length( y ) )
# [1] 14

print( s <- sum( y ) )
# [1] 70612

# math interlude: work out the likelihood function
# in the Exponential sampling model

# you can use either of the following two R functions
# to evaluate the likelihood (note that the second definition
# overwrites the first, since they share the same name):

exponential.likelihood <- function( lambda, s, n ) {

  log.likelihood <- - n * log( lambda ) - s / lambda

  return( exp( log.likelihood ) )

}

exponential.likelihood <- function( lambda, s, n ) {

  likelihood <- lambda^( - n ) * exp( - s / lambda )

  return( likelihood )

}

# both work equally well here, because with our
# n and s no overflow or underflow conditions arise

# warning: in some versions of this code i showed you earlier,
# the calling sequence for the likelihood function was
#   function( lambda, n, s )
# this will obviously produce different results
# if we inadvertently interchange the roles of s and n

# math interlude: show that the MLE of lambda is the sample mean

print( lambda.hat.mle <- mean( y ) )
# [1] 5043.714

# math interlude: write down the log likelihood function
# in this model

exponential.log.likelihood <- function( lambda, s, n ) {

  log.likelihood <- - n * log( lambda ) - s / lambda

  return( log.likelihood )

}

# let's plot the log likelihood for lambda between 2000
# and 12000

lambda.grid <- seq( 2000, 12000, length = 500 )

plot( lambda.grid, exponential.log.likelihood( lambda.grid, s, n ),
  type = 'l', lwd = 2, col = 'cadetblue4', xlab = 'lambda',
  ylab = 'log likelihood', main = 'Exponential sampling model' )

# math interlude: show that the inverse cdf (quantile function)
# in the Exponential sampling model is
#   F^( -1 )( p ) = - lambda * log( 1 - p )

exponential.inverse.cdf <- function( lambda, p ) {

  quantile <- - lambda * log( 1 - p )

  return( quantile )

}

# let's make the Exponential probability plot
# and see if it looks like the line y = x

plot( exponential.inverse.cdf( lambda.hat.mle, ( ( 1:n ) - 0.5 ) / n ),
  sort( y ), xlab = 'Exponential quantiles', ylab = 'y',
  xlim = c( 0, max( y ) ), type = 'l', lwd = 2, col = 'red' )

abline( 0, 1, lwd = 2, col = 'black', lty = 2 )
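# optional numerical sanity check (not part of the test): if the
# math interludes above are right, maximizing the log likelihood
# numerically should land on the sample mean, and our inverse cdf
# should agree with R's built-in qexp (which uses the
# rate = 1 / lambda parameterization); this is a minimal sketch --
# the search interval c( 1, 100000 ) is an arbitrary choice

print( optimize( exponential.log.likelihood, interval = c( 1, 100000 ),
  maximum = TRUE, s = s, n = n )$maximum )

# this should be close to lambda.hat.mle = 5043.714

print( all.equal( exponential.inverse.cdf( lambda.hat.mle, 0.9 ),
  qexp( 0.9, rate = 1 / lambda.hat.mle ) ) )

# this should print TRUE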
#########################################################################

# making the table in take-home test 1 problem 2(B)(v)(b)

inverse.gamma.mean.sd <- function( alpha, beta ) {

  mean <- beta / ( alpha - 1 )

  sd <- beta / ( ( alpha - 1 ) * sqrt( alpha - 2 ) )

  return( c( mean, sd ) )

}

alpha <- 33 / 4
beta <- 32625

print( prior.inverse.gamma.mean.sd <- inverse.gamma.mean.sd( alpha, beta ) )
# [1] 4500 1800

print( likelihood.inverse.gamma.mean.sd <- inverse.gamma.mean.sd( n - 1, s ) )
# [1] 5884.333 1774.193

print( posterior.inverse.gamma.mean.sd <-
  inverse.gamma.mean.sd( alpha + n, beta + s ) )
# [1] 4858.212 1079.603

#         prior   likelihood   posterior
# mean    4500    5884.3       4858.2      the posterior mean has to be between
#                                          the prior and likelihood means
# sd      1800    1774.2       1079.6      the posterior sd has to be smaller
#                                          than both the prior and likelihood sds

# get the inverse gamma density and quantile functions from CRAN

install.packages( 'invgamma' )

library( invgamma )

# plotting the prior, likelihood and posterior on the same graph

lambda.grid <- seq( 1000, 12000, length = 500 )

plot( lambda.grid, dinvgamma( lambda.grid, alpha + n, rate = ( beta + s ) ),
  type = 'l', lwd = 2, lty = 1, col = 'black', xlab = 'lambda',
  ylab = 'Density', main = 'THT 1 problem 2(B)(v)(c)' )

lines( lambda.grid, dinvgamma( lambda.grid, n - 1, rate = s ),
  lwd = 2, lty = 2, col = 'blue' )

lines( lambda.grid, dinvgamma( lambda.grid, alpha, rate = beta ),
  lwd = 2, lty = 3, col = 'darkcyan' )

text( 7000, 3.5e-04, 'posterior', col = 'black' )
text( 8000, 2.0e-04, 'likelihood', col = 'blue' )
text( 2000, 2.5e-04, 'prior', col = 'darkcyan' )

##########################################################################

# math interlude: show that the fisher information in this model
# is n / lambda.hat^2, so that the (repeated-sampling) standard error
# of the MLE is sqrt( 1 / ( fisher info ) ) = lambda.hat.mle / sqrt( n )

print( se.hat.lambda.hat.mle <- lambda.hat.mle / sqrt( n ) )
# [1] 1347.989

# math interlude: remember that a 100 ( 1 - alpha )% CI for lambda
# looks like
#   lambda.hat.mle +/- Phi^( -1 )( 1 - alpha / 2 ) * SE.hat( lambda.hat.mle )

print( z.95 <- qnorm( 1 - 0.05 / 2 ) )

print( ninety.five.percent.interval <-
  c( lambda.hat.mle - z.95 * se.hat.lambda.hat.mle,
     lambda.hat.mle + z.95 * se.hat.lambda.hat.mle ) )
# [1] 2401.704 7685.725

print( z.999 <- qnorm( 0.9995 ) )

print( ninety.nine.point.nine.percent.interval <-
  c( lambda.hat.mle - z.999 * se.hat.lambda.hat.mle,
     lambda.hat.mle + z.999 * se.hat.lambda.hat.mle ) )
# [1] 608.1193 9479.3093

# math interlude: recall from our earlier work that
# the posterior with the conjugate prior is
#   Gamma^( -1 )( alpha + n, beta + s )

# for the 95% posterior interval we want

c( qinvgamma( 0.025, alpha + n, rate = ( beta + s ) ),
   qinvgamma( 0.975, alpha + n, rate = ( beta + s ) ) )
# [1] 3186.027 7381.964

# for the 99.9% posterior interval we want

c( qinvgamma( 0.0005, alpha + n, rate = ( beta + s ) ),
   qinvgamma( 0.9995, alpha + n, rate = ( beta + s ) ) )
# [1] 2511.783 10423.188

# method                   95% interval         99.9% interval
# maximum likelihood       2401.7   7685.7       608.1   9479.3
# bayes with this prior    3186.0   7382.0      2511.8  10423.2

# these are quite different: why?
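# optional monte carlo check (not part of the test): we can verify
# the conjugate updating by simulation -- draws from the
# Gamma^( -1 )( alpha + n, beta + s ) posterior should have a mean,
# sd and 95% central interval close to the exact answers above;
# this is a minimal sketch, and the number of draws ( 1000000 )
# and the seed are arbitrary choices

set.seed( 1 )

lambda.star <- rinvgamma( 1000000, alpha + n, rate = ( beta + s ) )

print( c( mean( lambda.star ), sd( lambda.star ) ) )

# these should be close to 4858.212 and 1079.603

print( quantile( lambda.star, probs = c( 0.025, 0.975 ) ) )

# these should be close to 3186.027 and 7381.964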
#############################################################################

# the rest of this file covers optional material
# that's not part of take-home test 1

# the red curve and the black line (the target)
# in the Exponential probability plot above are not far from each other,
# which implies that the Exponential is a decent choice
# with this data set, but the two largest data points are somewhat
# bigger than the Exponential distribution expects; is this evidence
# sufficient to make us look for a better fit?

# we could answer this question with math, but it's easier
# to address with simulation

# here's the plan: (1) first i make a 'null' plot that will
# eventually contain all of the simulation results:

plot( exponential.inverse.cdf( lambda.hat.mle, ( ( 1:n ) - 0.5 ) / n ),
  sort( y ), col = 'red', xlab = 'Quantiles', ylab = 'sorted y values',
  main = 'Exponential Sampling Distribution', lwd = 2,
  xlim = c( 0, max( y ) ), ylim = c( 0, max( y ) ), type = 'n' )

# (2) then i execute a loop that repeatedly (a) generates a data set
# randomly drawn from the Exponential distribution with parameter
# lambda equal to lambda.hat.mle and (b) superimposes the resulting
# Exponential probability plot on the graph

# the help( rexp ) page says that the built-in function rexp( n, rate )
# creates n random draws from the distribution with density
#   f( y ) = rate * exp( - rate * y )

# this is not the parameterization we're using in the take-home test
# problem; to get the right distribution i have to set
#   rate = 1 / lambda.hat.mle

# i have to decide how many simulation replications to use;
# i'll start with M = 100

M <- 100

# each simulation replication gets its own random number seed
# ( set.seed( m ) inside the loop ), so that the whole simulation
# is reproducible without each replication repeating the same draws

for ( m in 1:M ) {

  set.seed( m )

  y.star <- rexp( n, rate = 1 / lambda.hat.mle )

  lines( exponential.inverse.cdf( lambda.hat.mle, ( ( 1:n ) - 0.5 ) / n ),
    sort( y.star ), lwd = 1, col = 'black', lty = 3 )

}

abline( 0, 1, lwd = 2, col = 'blue', lty = 2 )

# now i superimpose the *actual* Exponential probability plot
# with our data on top of this plot, to see how unusual
# our data set is

lines( exponential.inverse.cdf( lambda.hat.mle, ( ( 1:n ) - 0.5 ) / n ),
  sort( y ), lwd = 3, col = 'red' )

# you can see that our data set lies comfortably in the range
# of what to expect in Exponential probability plots
# of samples of size n = 14 -- in other words, the Exponential
# looks like a good sampling model for this case study

# if you want, you can make M bigger and see what happens;
# here i'll use M = 1000, and i'll increase the upper limit
# on the y axis so that less of the plot is cut off at the top:

plot( exponential.inverse.cdf( lambda.hat.mle, ( ( 1:n ) - 0.5 ) / n ),
  sort( y ), col = 'red', xlab = 'Quantiles', ylab = 'sorted y values',
  main = 'Exponential Sampling Distribution', lwd = 2,
  xlim = c( 0, 17000 ), ylim = c( 0, 55000 ), type = 'n' )

M <- 1000

# here's where all of those colors in R called 'grayxx' are helpful,
# for creating the uncertainty band without making it go too black

for ( m in 1:M ) {

  set.seed( m )

  y.star <- rexp( n, rate = 1 / lambda.hat.mle )

  lines( exponential.inverse.cdf( lambda.hat.mle, ( ( 1:n ) - 0.5 ) / n ),
    sort( y.star ), lwd = 1, col = 'gray70', lty = 3 )

}

abline( 0, 1, lwd = 2, col = 'blue', lty = 2 )

lines( exponential.inverse.cdf( lambda.hat.mle, ( ( 1:n ) - 0.5 ) / n ),
  sort( y ), lwd = 3, col = 'red' )

#############################################################################
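# one more optional idea (also not part of take-home test 1):
# instead of overplotting all M simulated probability plots, we can
# store the sorted simulated data sets and extract pointwise 2.5%
# and 97.5% quantiles, giving a numerical 95% uncertainty band for
# each order statistic; this is a minimal sketch -- M = 1000 and the
# 95% level are arbitrary choices, and the band is pointwise, not
# simultaneous, so it's only a rough diagnostic

M <- 1000

y.star.sorted <- matrix( NA, M, n )

for ( m in 1:M ) {

  set.seed( m )

  y.star.sorted[ m, ] <- sort( rexp( n, rate = 1 / lambda.hat.mle ) )

}

band <- apply( y.star.sorted, 2, quantile, probs = c( 0.025, 0.975 ) )

# any sorted data value falling outside its band would be a flag
# for lack of fit

print( rbind( lower = band[ 1, ], y = sort( y ), upper = band[ 2, ] ) )

#############################################################################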