###########################################################################

# examining the log likelihood function in the t( mu, sigma, nu )
# sampling model in the NB10 case study

# unbuffer the output

# set the right working directory

# bring in the NB10 data

y <- c( 409., 400., 406., 399., 402., 406., 401., 403., 401., 403.,
  398., 403., 407., 402., 401., 399., 400., 401., 405., 402.,
  408., 399., 399., 402., 399., 397., 407., 401., 399., 401.,
  403., 400., 410., 401., 407., 423., 406., 406., 402., 405.,
  405., 409., 399., 402., 407., 406., 413., 409., 404., 402.,
  404., 406., 407., 405., 411., 410., 410., 410., 401., 402.,
  404., 405., 392., 407., 406., 404., 403., 408., 404., 407.,
  412., 406., 409., 400., 408., 404., 401., 404., 408., 406.,
  408., 406., 401., 412., 393., 437., 418., 415., 404., 401.,
  401., 407., 412., 375., 409., 406., 398., 406., 403., 404. )

# math interlude: work out the form of the log-likelihood function

# this log likelihood function cannot be maximized
# in closed algebraic form, in part because of all of the
# Gamma functions involving nu

# all of our previous problems, in which the MLE can be expressed
# in closed form, are actually the exception, not the rule,
# in contemporary data science: in problems of realistic complexity
# we need to have good numerical optimization skills

# the optimizer i'm going to use below requires the
# parameters to be input as a vector instead of one by one,
# so i'll define theta = ( mu, sigma, nu ) as the
# parameter input

log.likelihood.t.sampling.model <- function( theta, y ) {

  mu <- theta[ 1 ]
  sigma <- theta[ 2 ]
  nu <- theta[ 3 ]

  n <- length( y )

  ll <- n * lgamma( ( nu + 1 ) / 2 ) - n * log( sigma ) - n * lgamma( nu / 2 ) -
    ( n / 2 ) * log( nu ) -
    ( ( nu + 1 ) / 2 ) * sum( log( 1 + ( ( y - mu ) / sigma )^2 / nu ) )

  return( ll )

}
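# the log likelihood being coded above is
#
#   ll( mu, sigma, nu | y ) = n log Gamma( ( nu + 1 ) / 2 ) - n log Gamma( nu / 2 )
#     - n log( sigma ) - ( n / 2 ) log( nu )
#     - ( ( nu + 1 ) / 2 ) sum_i log( 1 + ( ( y[ i ] - mu ) / sigma )^2 / nu ) ,
#
# in which the additive constant - ( n / 2 ) * log( pi ) from the scaled t
# density has been dropped; that constant has no effect on where the maximum
# is, on the hessian, or on the standard errors below

# (this next check is an addition to the original analysis) as a quick sanity
# check of the hand-coded function, R's built-in dt( ) can evaluate the same
# log likelihood: if x follows a standard t distribution with nu degrees of
# freedom, then mu + sigma * x has the scaled t density used here, so the full
# log likelihood is sum( dt( ( y - mu ) / sigma, nu, log = T ) - log( sigma ) );
# the two versions should therefore differ by exactly ( n / 2 ) * log( pi )

theta.check <- c( 405, 4, 5 )   # an arbitrary trial value, just for the check

log.likelihood.t.sampling.model( theta.check, y ) -
  sum( dt( ( y - theta.check[ 1 ] ) / theta.check[ 2 ], df = theta.check[ 3 ],
    log = T ) - log( theta.check[ 2 ] ) )

( length( y ) / 2 ) * log( pi )

# if the hand-coded function is right, these two numbers should agree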
# numerical optimization routines need starting values
# to begin their iterative searches

# i'll start mu out at the sample mean, which should be
# not far from the MLE

print( mu.0 <- mean( y ) )

# [1] 404.59

# based on the t quantile-quantile plots -- see the .txt file on the
# course web page called 'R code to explore the family of t sampling
# distributions with the NB10 data set' -- i'll start nu out at a
# fairly small value

print( nu.0 <- 6 )

# the variance of a draw from the scaled t distribution
# ( gelman et al., appendix A ) is ( nu / ( nu - 2 ) ) * sigma^2

print( sigma.0 <- sqrt( ( nu.0 - 2 ) / nu.0 ) * sd( y ) )

# [1] 5.280158

theta.0 <- c( mu.0, sigma.0, nu.0 )

log.likelihood.t.sampling.model( theta.0, y )

# [1] -256.046

exp( -256.046 )

# [1] 6.318807e-112

# the likelihood values with this data set are pretty close to 0,
# but R can handle 10^( -112 ); i started by examining
# the log likelihood function, but it looks like we can
# also work with the likelihood function without worrying
# about underflow

# one way in R to optimize a function that takes a vector theta
# as input and outputs a real number is a built-in R function
# called 'optim'

help( optim )

# you can see that using this function is non-trivial;
# you may need to fiddle with it to get a good result

# i'll try a 'vanilla' call to the function first

# optim's default is to *minimize* your function;
# control = list( fnscale = -1 ) minimizes ( -1 ) times your function,
# which of course turns the optimization into a maximization problem

numerical.mle.solution <- optim( theta.0, log.likelihood.t.sampling.model,
  y = y, control = list( fnscale = -1 ), hessian = T )

# Warning messages:
# 1: In log(sigma) : NaNs produced
# 2: In log(sigma) : NaNs produced
# 3: In log(sigma) : NaNs produced
# 4: In log(nu) : NaNs produced
# 5: In log(1 + ((y - mu)/sigma)^2/nu) : NaNs produced
# 6: In log(nu) : NaNs produced
# 7: In log(1 + ((y - mu)/sigma)^2/nu) : NaNs produced
# 8: In log(nu) : NaNs produced
# 9: In log(1 + ((y - mu)/sigma)^2/nu) : NaNs produced

# the default optimization method is Nelder-Mead,
# which searches outward from the starting values in theta.0
# to look for the maximum; unfortunately in doing so
# it can generate negative sigma and/or negative nu values,
# which *we* know are impossible, but Nelder-Mead
# doesn't know that

# so instead let's try
#   method = 'L-BFGS-B'
# this allows us to impose a 'box' around
# each parameter by assigning lower and upper values
# beyond which the algorithm is not allowed to search

# setting the box is tricky, though: i know that i want
# the lower values for sigma and nu to be bounded away from 0
# on the positive side of 0, but it's hard to know
# what the upper values should be for sigma and nu,
# and also to know what to use for the lower and upper values for mu

# fortunately you don't need to set upper bounds
# if they're not needed; and there's no need to set
# a lower bound for mu, so i can set that lower bound to NA (missing)

# let's try lower bounds for sigma and nu of the form
# (say) epsilon = 0.001

# it's always better to try an unnecessarily large box;
# if the box is too small, the 'optimum' may well end up
# at the edge of the box, which tells you that the box
# you set is too small

# however, we may be playing with fire by letting epsilon
# be so close to 0; let's just see what happens

epsilon <- 0.001

numerical.mle.solution <- optim( theta.0, method = 'L-BFGS-B',
  log.likelihood.t.sampling.model, y = y, control = list( fnscale = -1 ),
  hessian = T, lower = c( NA, epsilon, epsilon ) )

# this produced no error message, but it still could be wrong,
# if optim has detected a failure to converge

str( numerical.mle.solution )

# List of 6
#  $ par        : num [1:3] 404.28 3.7 3.01
#  $ value      : num -251
#  $ counts     : Named int [1:2] 15 15
#   ..- attr(*, "names")= chr [1:2] "function" "gradient"
#  $ convergence: int 0
#  $ message    : chr "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
#  $ hessian    : num [1:3, 1:3] -4.721 0.185 0.114 0.185 -7.769 ...

print( numerical.mle.solution )

# $par
# [1] 404.278406   3.700285   3.008206
#
# $value
# [1] -250.8514
#
# $counts
# function gradient
#       15       15
#
# $convergence
# [1] 0
#
# $message
# [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
#
# $hessian
#            [,1]       [,2]       [,3]
# [1,] -4.7208971  0.1850944  0.1138392
# [2,]  0.1850944 -7.7689323  1.9642652
# [3,]  0.1138392  1.9642652 -1.8484565

# both the ( $convergence = 0 ) flag and the message
# "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
# indicate successful convergence to the MLE
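# (this next cross-check is an addition to the original analysis) another way
# to keep sigma and nu away from illegal negative values, instead of putting a
# box around them, is to reparameterize: optimize over
# eta = ( mu, log( sigma ), log( nu ) ), which is unconstrained, and transform
# back at the end; plain (default) Nelder-Mead is then fine; a minimal sketch:

log.likelihood.t.reparameterized <- function( eta, y ) {

  # map the unconstrained parameters back to ( mu, sigma, nu )
  log.likelihood.t.sampling.model( c( eta[ 1 ], exp( eta[ 2 ] ), exp( eta[ 3 ] ) ), y )

}

eta.0 <- c( mu.0, log( sigma.0 ), log( nu.0 ) )

reparameterized.mle.solution <- optim( eta.0, log.likelihood.t.reparameterized,
  y = y, control = list( fnscale = -1 ) )

# transforming back should reproduce ( 404.278, 3.700, 3.008 ) to within
# Nelder-Mead's default convergence tolerance

c( reparameterized.mle.solution$par[ 1 ],
  exp( reparameterized.mle.solution$par[ 2 ] ),
  exp( reparameterized.mle.solution$par[ 3 ] ) )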
# here's the MLE vector:

print( theta.hat.mle <- numerical.mle.solution$par )

#       mu.hat   sigma.hat     nu.hat
# [1] 404.278406   3.700285   3.008206

print( mu.hat.mle <- theta.hat.mle[ 1 ] )

# [1] 404.2784

print( sigma.hat.mle <- theta.hat.mle[ 2 ] )

# [1] 3.700285

print( nu.hat.mle <- theta.hat.mle[ 3 ] )

# [1] 3.008206

# the generalization of the fisher-information-standard-error
# story from k = 1 unknown to k > 1 is that the covariance
# matrix of the MLE vector is the inverse of
# minus the hessian (the matrix of second partial derivatives
# of the log likelihood function), evaluated at the MLE

# in R, if A is a square invertible matrix, the function call
#   solve( A )
# returns the inverse of A

print( covariance.matrix.theta.hat.mle <- solve( - numerical.mle.solution$hessian ) )

#                mu [,1]   sigma [,2]      nu [,3]
# mu    [1,] 0.21288379   0.01146797   0.02529714
# sigma [2,] 0.01146797   0.17662445   0.18839652
# nu    [3,] 0.02529714   0.18839652   0.74274971

# to get the correlation matrix of the MLE vector,
# it would be nice if R had a function that
# computed a correlation matrix from a covariance matrix;
# a google search reveals that there is such a function,
# called 'cov2cor'

print( correlation.matrix.theta.hat.mle <-
  cov2cor( covariance.matrix.theta.hat.mle ) )

#                mu [,1]   sigma [,2]      nu [,3]
# mu    [1,] 1.00000000   0.05914118   0.06361785
# sigma [2,] 0.05914118   1.00000000   0.52014714
# nu    [3,] 0.06361785   0.52014714   1.00000000

# as in the Gaussian sampling model, mu and sigma
# are essentially uncorrelated, and mu and nu
# also are close to uncorrelated, but there's a non-negligible
# correlation of +0.52 between sigma and nu (why?)

# the standard errors of the MLE components
# are the square roots of the diagonal entries
# in the covariance matrix

print( se.hat.theta.hat.mle <- sqrt( diag( covariance.matrix.theta.hat.mle ) ) )

# [1] 0.4613933 0.4202671 0.8618293

print( se.hat.mu.hat.mle <- se.hat.theta.hat.mle[ 1 ] )

# [1] 0.4613933

print( se.hat.sigma.hat.mle <- se.hat.theta.hat.mle[ 2 ] )

# [1] 0.4202671

print( se.hat.nu.hat.mle <- se.hat.theta.hat.mle[ 3 ] )

# [1] 0.8618293
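# (this next comparison is an addition to the original analysis) it's worth
# noticing how much smaller the standard error for mu is under the t model
# than the usual Gaussian-model standard error sd( y ) / sqrt( n ) for the
# sample mean: the heavy-tailed t model downweights the outlying observations
# (375 and 437), so it pins down the center of the distribution more precisely

sd( y ) / sqrt( length( y ) )

# this is about 0.65, compared with se.hat.mu.hat.mle =. 0.46 under the t model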
qnorm( 0.9995 )

# [1] 3.290527

# to get 99.9% approximate (large-n) confidence intervals for the parameters,
# as usual we go qnorm( 0.9995 ) =. 3.29 standard errors either way from the MLEs

print( mu.interval <- c( mu.hat.mle - qnorm( 0.9995 ) * se.hat.mu.hat.mle,
  mu.hat.mle + qnorm( 0.9995 ) * se.hat.mu.hat.mle ) )

# [1] 402.7602 405.7966

print( sigma.interval <- c( sigma.hat.mle - qnorm( 0.9995 ) * se.hat.sigma.hat.mle,
  sigma.hat.mle + qnorm( 0.9995 ) * se.hat.sigma.hat.mle ) )

# [1] 2.317385 5.083185

print( nu.interval <- c( nu.hat.mle - qnorm( 0.9995 ) * se.hat.nu.hat.mle,
  nu.hat.mle + qnorm( 0.9995 ) * se.hat.nu.hat.mle ) )

# [1] 0.1723333 5.8440778

# notice how close the 99.9% interval for nu came to going negative,
# which of course would be incoherent (logically internally inconsistent)

# ---------------------- likelihood analysis ----------------------
#
#                          standard          99.9% interval
#   parameter   estimate   error           lower       upper
#
#   mu           404.3     0.461           402.7       405.8
#   sigma          3.700   0.420             2.317       5.083
#   nu             3.008   0.862             0.1723      5.844

########################################################################

# now let's visualize the log likelihood and likelihood functions

# full visualization would require 4 dimensions (3 for
# the parameters and one for the (log) likelihood)

# let's sneak up on that by fixing one of the three parameters
# at its MLE value and making 3-dimensional contour and perspective plots
# over the other 2 coordinates

n.grid <- 50

mu.grid <- seq( mu.interval[ 1 ], mu.interval[ 2 ], length = n.grid )
sigma.grid <- seq( sigma.interval[ 1 ], sigma.interval[ 2 ], length = n.grid )
nu.grid <- seq( nu.interval[ 1 ], nu.interval[ 2 ], length = n.grid )

# first, let's look at the mu-sigma story, with nu set to nu.hat.mle

log.likelihood.mu.sigma.grid <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {
  for ( j in 1:n.grid ) {

    log.likelihood.mu.sigma.grid[ i, j ] <- log.likelihood.t.sampling.model(
      c( mu.grid[ i ], sigma.grid[ j ], nu.hat.mle ), y )

  }
}

par( mfrow = c( 2, 2 ) )

contour( mu.grid, sigma.grid, log.likelihood.mu.sigma.grid, xlab = 'mu',
  ylab = 'sigma', nlevels = 10, col = 'blue', lwd = 2, main = 'log likelihood' )

persp( mu.grid, sigma.grid, log.likelihood.mu.sigma.grid, xlab = 'mu',
  ylab = 'sigma', theta = 30, phi = 30, zlab = 'log likelihood', col = 'red' )

contour( mu.grid, sigma.grid, exp( log.likelihood.mu.sigma.grid ), xlab = 'mu',
  ylab = 'sigma', nlevels = 10, col = 'blue', lwd = 2, main = 'likelihood' )

persp( mu.grid, sigma.grid, exp( log.likelihood.mu.sigma.grid ), xlab = 'mu',
  ylab = 'sigma', theta = 30, phi = 30, zlab = 'likelihood', col = 'red' )

par( mfrow = c( 1, 1 ) )

# along these two dimensions of the 3-dimensional (log) likelihood function,
# everything looks nice and unimodal (and not far from bivariate normality)
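# (this next plot is an addition to the original analysis) one way to check the
# 'not far from bivariate normality' remark is to overlay, on the exact log
# likelihood contours, the contours of the quadratic (normal) approximation to
# this ( mu, sigma ) slice, built from the corresponding 2 x 2 block of the
# hessian at the MLE; if the remark is on target, the two sets of contours
# should nearly coincide near the mode

hessian.block.mu.sigma <- numerical.mle.solution$hessian[ 1:2, 1:2 ]

quadratic.approximation.mu.sigma.grid <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {
  for ( j in 1:n.grid ) {

    delta <- c( mu.grid[ i ] - mu.hat.mle, sigma.grid[ j ] - sigma.hat.mle )

    quadratic.approximation.mu.sigma.grid[ i, j ] <- numerical.mle.solution$value +
      0.5 * as.numeric( t( delta ) %*% hessian.block.mu.sigma %*% delta )

  }
}

contour.levels <- pretty( range( log.likelihood.mu.sigma.grid ), 10 )

contour( mu.grid, sigma.grid, log.likelihood.mu.sigma.grid, levels = contour.levels,
  col = 'blue', lwd = 2, xlab = 'mu', ylab = 'sigma',
  main = 'exact (blue) versus quadratic approximation (red)' )

contour( mu.grid, sigma.grid, quadratic.approximation.mu.sigma.grid,
  levels = contour.levels, col = 'red', lwd = 2, lty = 2, add = T )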
# the next object is a helper function that (a) calls install.packages
# and then require, if the package still needs to be fetched from CRAN,
# or (b) calls only require if the package has already been installed
# on the machine that's running R in this session

dynamic.require <- function( package ) {

  if ( eval( parse( text = paste( 'require(', package, ')' ) ) ) ) {
    return( 'done' )
  }

  install.packages( package )

  return( eval( parse( text = paste( 'require(', package, ')' ) ) ) )

}

dynamic.require( 'plotly' )

# here i'll make the fancier interactive perspective plot
# just for the likelihood function

# note that plot_ly's add_surface expects the z matrix to have one row per
# y value and one column per x value, which is why the grid is transposed

likelihood.mu.sigma.grid.t <- t( exp( log.likelihood.mu.sigma.grid ) )

interactive.mu.sigma.likelihood.perspective.plot <- plot_ly( x = mu.grid,
  y = sigma.grid, z = likelihood.mu.sigma.grid.t ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of the bivariate density',
    scene = list( xaxis = list( title = 'mu' ), yaxis = list( title = 'sigma' ),
      zaxis = list( title = 'likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Bivariate Density' )

interactive.mu.sigma.likelihood.perspective.plot

# next, the mu-nu story, with sigma set to sigma.hat.mle

log.likelihood.mu.nu.grid <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {
  for ( j in 1:n.grid ) {

    log.likelihood.mu.nu.grid[ i, j ] <- log.likelihood.t.sampling.model(
      c( mu.grid[ i ], sigma.hat.mle, nu.grid[ j ] ), y )

  }
}

par( mfrow = c( 2, 2 ) )

contour( mu.grid, nu.grid, log.likelihood.mu.nu.grid, xlab = 'mu', ylab = 'nu',
  nlevels = 30, col = 'blue', lwd = 2, main = 'log likelihood' )

persp( mu.grid, nu.grid, log.likelihood.mu.nu.grid, xlab = 'mu', ylab = 'nu',
  theta = 30, phi = 30, zlab = 'log likelihood', col = 'red' )

contour( mu.grid, nu.grid, exp( log.likelihood.mu.nu.grid ), xlab = 'mu',
  ylab = 'nu', nlevels = 10, col = 'blue', lwd = 2, main = 'likelihood' )

persp( mu.grid, nu.grid, exp( log.likelihood.mu.nu.grid ), xlab = 'mu',
  ylab = 'nu', theta = 30, phi = 30, zlab = 'likelihood', col = 'red' )

par( mfrow = c( 1, 1 ) )

# notice how flat the log likelihood surface is in the nu direction;
# even with n = 100 observations we still have substantial uncertainty
# about nu in this sampling model

# even so, again, along *these* two dimensions, everything
# looks nice and unimodal (and, again, not far from bivariate normality)

likelihood.mu.nu.grid.t <- t( exp( log.likelihood.mu.nu.grid ) )

interactive.mu.nu.likelihood.perspective.plot <- plot_ly( x = mu.grid,
  y = nu.grid, z = likelihood.mu.nu.grid.t ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of the bivariate density',
    scene = list( xaxis = list( title = 'mu' ), yaxis = list( title = 'nu' ),
      zaxis = list( title = 'likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Bivariate Density' )

interactive.mu.nu.likelihood.perspective.plot

# and finally, the sigma-nu story, with mu set to mu.hat.mle

log.likelihood.sigma.nu.grid <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {
  for ( j in 1:n.grid ) {

    log.likelihood.sigma.nu.grid[ i, j ] <- log.likelihood.t.sampling.model(
      c( mu.hat.mle, sigma.grid[ i ], nu.grid[ j ] ), y )

  }
}

par( mfrow = c( 2, 2 ) )

contour( sigma.grid, nu.grid, log.likelihood.sigma.nu.grid, xlab = 'sigma',
  ylab = 'nu', nlevels = 30, col = 'blue', lwd = 2, main = 'log likelihood' )

persp( sigma.grid, nu.grid, log.likelihood.sigma.nu.grid, xlab = 'sigma',
  ylab = 'nu', theta = 30, phi = 30, zlab = 'log likelihood', col = 'red' )

contour( sigma.grid, nu.grid, exp( log.likelihood.sigma.nu.grid ), xlab = 'sigma',
  ylab = 'nu', nlevels = 10, col = 'blue', lwd = 2, main = 'likelihood' )

persp( sigma.grid, nu.grid, exp( log.likelihood.sigma.nu.grid ), xlab = 'sigma',
  ylab = 'nu', theta = 30, phi = 30, zlab = 'likelihood', col = 'red' )

par( mfrow = c( 1, 1 ) )

# here's where something more interesting is happening:
# the log likelihood perspective plot in the ( sigma, nu )
# visualization looks *nothing* like a paraboloid, and
# the likelihood contour plot shows a long-right-hand-tail
# shape in both parameters, especially nu
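# (this next calculation is an addition to the original analysis) the long
# right tail in nu can be seen even more directly with a profile log likelihood:
# for each value of nu on a grid, maximize the log likelihood over ( mu, sigma )
# and plot the maximized value against nu; the grid below (from 1 to 15) is an
# arbitrary choice, just for illustration

nu.profile.grid <- seq( 1, 15, length = 50 )

profile.log.likelihood.nu <- rep( NA, length( nu.profile.grid ) )

for ( k in 1:length( nu.profile.grid ) ) {

  profile.fit <- optim( c( mu.hat.mle, sigma.hat.mle ),
    function( phi, y, nu ) log.likelihood.t.sampling.model( c( phi, nu ), y ),
    y = y, nu = nu.profile.grid[ k ], method = 'L-BFGS-B',
    lower = c( -Inf, epsilon ), control = list( fnscale = -1 ) )

  profile.log.likelihood.nu[ k ] <- profile.fit$value

}

plot( nu.profile.grid, profile.log.likelihood.nu, type = 'l', lwd = 2,
  col = 'blue', xlab = 'nu', ylab = 'profile log likelihood' )

abline( v = nu.hat.mle, lty = 2 )

# the profile curve should drop off steeply to the left of nu.hat.mle and only
# slowly to the right, which is exactly the asymmetry that made the
# standard-error-based interval for nu nearly go negative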
likelihood.sigma.nu.grid.t <- t( exp( log.likelihood.sigma.nu.grid ) )

interactive.sigma.nu.likelihood.perspective.plot <- plot_ly( x = sigma.grid,
  y = nu.grid, z = likelihood.sigma.nu.grid.t ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of the bivariate density',
    scene = list( xaxis = list( title = 'sigma' ), yaxis = list( title = 'nu' ),
      zaxis = list( title = 'likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Bivariate Density' )

interactive.sigma.nu.likelihood.perspective.plot

# in fact, the skewness of the marginal distribution for nu is so strong
# that going qnorm( 0.9995 ) =. 3.29 standard errors either way from the MLE
# is not far enough out in the right tail

######################################################################

# now let's do the 4-dimensional visualization

# with the help of one of my graduate students (erdong guo),
# i developed the following code using the function 'plot_ly'
# from the plotly package

# i'll show you two different graphs, with plotting symbols
# of different sizes

##### plot 1

n.grid <- 20

mu.grid <- seq( mu.interval[ 1 ], mu.interval[ 2 ], length = n.grid )
sigma.grid <- seq( sigma.interval[ 1 ], sigma.interval[ 2 ], length = n.grid )
nu.grid <- seq( nu.interval[ 1 ], nu.interval[ 2 ], length = n.grid )

grid.plot <- 20

likelihood.mu.sigma.nu.grid <- rep( NA, grid.plot * grid.plot * grid.plot )

mu.grid.3d.plot <- rep( NA, grid.plot * grid.plot * grid.plot )
sigma.grid.3d.plot <- rep( NA, grid.plot * grid.plot * grid.plot )
nu.grid.3d.plot <- rep( NA, grid.plot * grid.plot * grid.plot )

count <- 1

for ( i in 1:grid.plot ) {
  for ( j in 1:grid.plot ) {
    for ( k in 1:grid.plot ) {

      likelihood.mu.sigma.nu.grid[ count ] <- exp( log.likelihood.t.sampling.model(
        c( mu.grid[ i ], sigma.grid[ j ], nu.grid[ k ] ), y ) )

      mu.grid.3d.plot[ count ] <- mu.grid[ i ]
      sigma.grid.3d.plot[ count ] <- sigma.grid[ j ]
      nu.grid.3d.plot[ count ] <- nu.grid[ k ]

      count <- count + 1

    }
  }
}

interactive.mu.sigma.nu.likelihood.scatter.plot <- plot_ly( type = 'scatter3d',
  mode = 'markers', x = mu.grid.3d.plot, y = sigma.grid.3d.plot,
  z = nu.grid.3d.plot, marker = list( color = likelihood.mu.sigma.nu.grid,
    colorscale = list( c( 0, rgb( 1, 1, 1, 0 ) ), c( 1, '#8B0000FF' ) ),
    showscale = T ) ) %>%
  add_markers( ) %>%
  layout( scene = list( xaxis = list( title = 'mu' ),
    yaxis = list( title = 'sigma' ), zaxis = list( title = 'nu' ) ) )

interactive.mu.sigma.nu.likelihood.scatter.plot

##### plot 2

interactive.mu.sigma.nu.likelihood.scatter.plot <- plot_ly( type = 'scatter3d',
  mode = 'markers', x = mu.grid.3d.plot, y = sigma.grid.3d.plot,
  z = nu.grid.3d.plot, marker = list( color = likelihood.mu.sigma.nu.grid,
    colorscale = list( c( 0, rgb( 1, 1, 1, 0 ) ), c( 1, '#8B0000FF' ) ),
    showscale = T, size = 2 ) ) %>%
  add_markers( ) %>%
  layout( scene = list( xaxis = list( title = 'mu' ),
    yaxis = list( title = 'sigma' ), zaxis = list( title = 'nu' ) ) )

interactive.mu.sigma.nu.likelihood.scatter.plot

# from both plots you can see that the 3-dimensional likelihood function
# has a single mode in the middle of the 3-D box; if the likelihood
# had been multi-modal, there would have been two or more distinct
# point clouds with substantial likelihood values

############################################################################
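# postscript (an addition to the original analysis): as a small numerical check
# on the conclusion just drawn, the grid point with the largest likelihood in
# the 3-D scatter-plot grid should sit close to the MLE found by optim

highest.likelihood.grid.point <- which.max( likelihood.mu.sigma.nu.grid )

c( mu.grid.3d.plot[ highest.likelihood.grid.point ],
  sigma.grid.3d.plot[ highest.likelihood.grid.point ],
  nu.grid.3d.plot[ highest.likelihood.grid.point ] )

# this should agree with theta.hat.mle = ( 404.278, 3.700, 3.008 ) up to the
# coarseness of the 20 x 20 x 20 grid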