###########################################################################

# examining the log likelihood function in the t( mu, sigma, nu )
# sampling model in the NB10 case study

# unbuffer the output

# set the right working directory

# bring in the NB10 data

y <- c( 409., 400., 406., 399., 402., 406., 401., 403., 401., 403.,
  398., 403., 407., 402., 401., 399., 400., 401., 405., 402.,
  408., 399., 399., 402., 399., 397., 407., 401., 399., 401.,
  403., 400., 410., 401., 407., 423., 406., 406., 402., 405.,
  405., 409., 399., 402., 407., 406., 413., 409., 404., 402.,
  404., 406., 407., 405., 411., 410., 410., 410., 401., 402.,
  404., 405., 392., 407., 406., 404., 403., 408., 404., 407.,
  412., 406., 409., 400., 408., 404., 401., 404., 408., 406.,
  408., 406., 401., 412., 393., 437., 418., 415., 404., 401.,
  401., 407., 412., 375., 409., 406., 398., 406., 403., 404. )

# math interlude: work out the form of the log-likelihood function

# this log likelihood function cannot be maximized
# in closed algebraic form, in part because of all of the
# Gamma functions involving nu

# all of our previous problems, in which the MLE can be expressed
# in closed form, are actually the exception, not the rule,
# in contemporary data science: in problems of realistic complexity
# we need to have good numerical optimization skills

# the optimizer i'm going to use below requires the
# parameters to be input as a vector instead of one by one,
# so i'll define theta = ( mu, sigma, nu ) as the
# parameter input

log.likelihood.t.sampling.model <- function( theta, y ) {

  mu <- theta[ 1 ]
  sigma <- theta[ 2 ]
  nu <- theta[ 3 ]

  n <- length( y )

  ll <- n * lgamma( ( nu + 1 ) / 2 ) - n * log( sigma ) - n * lgamma( nu / 2 ) -
    ( n / 2 ) * log( nu ) -
    ( ( nu + 1 ) / 2 ) * sum( log( 1 + ( ( y - mu ) / sigma )^2 / nu ) )

  return( ll )

}
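# the log likelihood being coded above is
#
#   ll( mu, sigma, nu | y ) = n log Gamma( ( nu + 1 ) / 2 ) - n log Gamma( nu / 2 )
#     - n log( sigma ) - ( n / 2 ) log( nu )
#     - ( ( nu + 1 ) / 2 ) sum_i log( 1 + ( ( y[ i ] - mu ) / sigma )^2 / nu ) ,
#
# in which the additive constant - ( n / 2 ) * log( pi ) from the scaled t
# density has been dropped; that constant has no effect on where the maximum
# is, on the hessian, or on the standard errors below

# (this next check is an addition to the original analysis) as a quick sanity
# check of the hand-coded function, R's built-in dt( ) can evaluate the same
# log likelihood: if x follows a standard t distribution with nu degrees of
# freedom, then mu + sigma * x has the scaled t density used here, so the full
# log likelihood is sum( dt( ( y - mu ) / sigma, nu, log = T ) - log( sigma ) );
# the two versions should therefore differ by exactly ( n / 2 ) * log( pi )

theta.check <- c( 405, 4, 5 )   # an arbitrary trial value, just for the check

log.likelihood.t.sampling.model( theta.check, y ) -
  sum( dt( ( y - theta.check[ 1 ] ) / theta.check[ 2 ], df = theta.check[ 3 ],
    log = T ) - log( theta.check[ 2 ] ) )

( length( y ) / 2 ) * log( pi )

# if the hand-coded function is right, these two numbers should agree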
# numerical optimization routines need starting values
# to begin their iterative searches

# i'll start mu out at the sample mean, which should be
# not far from the MLE

print( mu.0 <- mean( y ) )

# [1] 404.59

# based on the t quantile-quantile plots -- see the .txt file on the
# course web page called 'R code to explore the family of t sampling
# distributions with the NB10 data set' -- i'll start nu out at a
# fairly small value

print( nu.0 <- 6 )

# the variance of a draw from the scaled t distribution
# ( gelman et al., appendix A ) is ( nu / ( nu - 2 ) ) * sigma^2

print( sigma.0 <- sqrt( ( nu.0 - 2 ) / nu.0 ) * sd( y ) )

# [1] 5.280158

theta.0 <- c( mu.0, sigma.0, nu.0 )

log.likelihood.t.sampling.model( theta.0, y )

# [1] -256.046

exp( -256.046 )

# [1] 6.318807e-112

# the likelihood values with this data set are pretty close to 0,
# but R can handle 10^( -112 ); i started by examining
# the log likelihood function, but it looks like we can
# also work with the likelihood function without worrying
# about underflow

# one way in R to optimize a function that takes a vector theta
# as input and outputs a real number is a built-in R function
# called 'optim'

help( optim )

# you can see that using this function is non-trivial;
# you may need to fiddle with it to get a good result

# i'll try a 'vanilla' call to the function first

# optim's default is to *minimize* your function;
# control = list( fnscale = -1 ) minimizes ( -1 ) times your function,
# which of course turns the optimization into a maximization problem

numerical.mle.solution <- optim( theta.0, log.likelihood.t.sampling.model,
  y = y, control = list( fnscale = -1 ), hessian = T )

# Warning messages:
# 1: In log(sigma) : NaNs produced
# 2: In log(sigma) : NaNs produced
# 3: In log(sigma) : NaNs produced
# 4: In log(nu) : NaNs produced
# 5: In log(1 + ((y - mu)/sigma)^2/nu) : NaNs produced
# 6: In log(nu) : NaNs produced
# 7: In log(1 + ((y - mu)/sigma)^2/nu) : NaNs produced
# 8: In log(nu) : NaNs produced
# 9: In log(1 + ((y - mu)/sigma)^2/nu) : NaNs produced

# the default optimization method is Nelder-Mead,
# which searches outward from the starting values in theta.0
# to look for the maximum; unfortunately in doing so
# it can generate negative sigma and/or negative nu values,
# which *we* know are impossible, but Nelder-Mead
# doesn't know that

# so instead let's try
#   method = 'L-BFGS-B'
# this allows us to impose a 'box' around
# each parameter by assigning lower and upper values
# beyond which the algorithm is not allowed to search

# setting the box is tricky, though: i know that i want
# the lower values for sigma and nu to be bounded away from 0
# on the positive side of 0, but it's hard to know
# what the upper values should be for sigma and nu,
# and also to know what to use for the lower and upper values for mu

# fortunately you don't need to set upper bounds
# if they're not needed; and there's no need to set
# a lower bound for mu, so i can set that lower bound to NA (missing)

# let's try lower bounds for sigma and nu of the form
# (say) epsilon = 0.001

# it's always better to try an unnecessarily large box;
# if the box is too small, the 'optimum' may well end up
# at the edge of the box, which tells you that the box
# you set is too small

# however, we may be playing with fire by letting epsilon
# be so close to 0; let's just see what happens

epsilon <- 0.001

numerical.mle.solution <- optim( theta.0, method = 'L-BFGS-B',
  log.likelihood.t.sampling.model, y = y, control = list( fnscale = -1 ),
  hessian = T, lower = c( NA, epsilon, epsilon ) )

# this produced no error message, but it still could be wrong,
# if optim has detected a failure to converge

str( numerical.mle.solution )

# List of 6
#  $ par        : num [1:3] 404.28 3.7 3.01
#  $ value      : num -251
#  $ counts     : Named int [1:2] 15 15
#   ..- attr(*, "names")= chr [1:2] "function" "gradient"
#  $ convergence: int 0
#  $ message    : chr "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
#  $ hessian    : num [1:3, 1:3] -4.721 0.185 0.114 0.185 -7.769 ...

print( numerical.mle.solution )

# $par
# [1] 404.278406   3.700285   3.008206
#
# $value
# [1] -250.8514
#
# $counts
# function gradient
#       15       15
#
# $convergence
# [1] 0
#
# $message
# [1] "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
#
# $hessian
#            [,1]       [,2]       [,3]
# [1,] -4.7208971  0.1850944  0.1138392
# [2,]  0.1850944 -7.7689323  1.9642652
# [3,]  0.1138392  1.9642652 -1.8484565

# both the ( $convergence = 0 ) flag and the message
# "CONVERGENCE: REL_REDUCTION_OF_F <= FACTR*EPSMCH"
# indicate successful convergence to the MLE
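# (this next cross-check is an addition to the original analysis) another way
# to keep sigma and nu away from illegal negative values, instead of putting a
# box around them, is to reparameterize: optimize over
# eta = ( mu, log( sigma ), log( nu ) ), which is unconstrained, and transform
# back at the end; plain (default) Nelder-Mead is then fine; a minimal sketch:

log.likelihood.t.reparameterized <- function( eta, y ) {

  # map the unconstrained parameters back to ( mu, sigma, nu )
  log.likelihood.t.sampling.model( c( eta[ 1 ], exp( eta[ 2 ] ), exp( eta[ 3 ] ) ), y )

}

eta.0 <- c( mu.0, log( sigma.0 ), log( nu.0 ) )

reparameterized.mle.solution <- optim( eta.0, log.likelihood.t.reparameterized,
  y = y, control = list( fnscale = -1 ) )

# transforming back should reproduce ( 404.278, 3.700, 3.008 ) to within
# Nelder-Mead's default convergence tolerance

c( reparameterized.mle.solution$par[ 1 ],
  exp( reparameterized.mle.solution$par[ 2 ] ),
  exp( reparameterized.mle.solution$par[ 3 ] ) )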
# here's the MLE vector:

print( theta.hat.mle <- numerical.mle.solution$par )

#       mu.hat   sigma.hat     nu.hat
# [1] 404.278406   3.700285   3.008206

print( mu.hat.mle <- theta.hat.mle[ 1 ] )

# [1] 404.2784

print( sigma.hat.mle <- theta.hat.mle[ 2 ] )

# [1] 3.700285

print( nu.hat.mle <- theta.hat.mle[ 3 ] )

# [1] 3.008206

# the generalization of the fisher-information-standard-error
# story from k = 1 unknown to k > 1 is that the covariance
# matrix of the MLE vector is the inverse of
# minus the hessian (the matrix of second partial derivatives
# of the log likelihood function), evaluated at the MLE

# in R, if A is a square invertible matrix, the function call
#   solve( A )
# returns the inverse of A

print( covariance.matrix.theta.hat.mle <- solve( - numerical.mle.solution$hessian ) )

#                mu [,1]   sigma [,2]      nu [,3]
# mu    [1,] 0.21288379   0.01146797   0.02529714
# sigma [2,] 0.01146797   0.17662445   0.18839652
# nu    [3,] 0.02529714   0.18839652   0.74274971

# to get the correlation matrix of the MLE vector,
# it would be nice if R had a function that
# computed a correlation matrix from a covariance matrix;
# a google search reveals that there is such a function,
# called 'cov2cor'

print( correlation.matrix.theta.hat.mle <-
  cov2cor( covariance.matrix.theta.hat.mle ) )

#                mu [,1]   sigma [,2]      nu [,3]
# mu    [1,] 1.00000000   0.05914118   0.06361785
# sigma [2,] 0.05914118   1.00000000   0.52014714
# nu    [3,] 0.06361785   0.52014714   1.00000000

# as in the Gaussian sampling model, mu and sigma
# are essentially uncorrelated, and mu and nu
# also are close to uncorrelated, but there's a non-negligible
# correlation of +0.52 between sigma and nu (why?)

# the standard errors of the MLE components
# are the square roots of the diagonal entries
# in the covariance matrix

print( se.hat.theta.hat.mle <- sqrt( diag( covariance.matrix.theta.hat.mle ) ) )

# [1] 0.4613933 0.4202671 0.8618293

print( se.hat.mu.hat.mle <- se.hat.theta.hat.mle[ 1 ] )

# [1] 0.4613933

print( se.hat.sigma.hat.mle <- se.hat.theta.hat.mle[ 2 ] )

# [1] 0.4202671

print( se.hat.nu.hat.mle <- se.hat.theta.hat.mle[ 3 ] )

# [1] 0.8618293
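# (this next comparison is an addition to the original analysis) it's worth
# noticing how much smaller the standard error for mu is under the t model
# than the usual Gaussian-model standard error sd( y ) / sqrt( n ) for the
# sample mean: the heavy-tailed t model downweights the outlying observations
# (375 and 437), so it pins down the center of the distribution more precisely

sd( y ) / sqrt( length( y ) )

# this is about 0.65, compared with se.hat.mu.hat.mle =. 0.46 under the t model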
qnorm( 0.9995 )

# [1] 3.290527

# to get 99.9% approximate (large-n) confidence intervals for the parameters,
# as usual we go qnorm( 0.9995 ) =. 3.29 standard errors either way from the MLEs

print( mu.interval <- c( mu.hat.mle - qnorm( 0.9995 ) * se.hat.mu.hat.mle,
  mu.hat.mle + qnorm( 0.9995 ) * se.hat.mu.hat.mle ) )

# [1] 402.7602 405.7966

print( sigma.interval <- c( sigma.hat.mle - qnorm( 0.9995 ) * se.hat.sigma.hat.mle,
  sigma.hat.mle + qnorm( 0.9995 ) * se.hat.sigma.hat.mle ) )

# [1] 2.317385 5.083185

print( nu.interval <- c( nu.hat.mle - qnorm( 0.9995 ) * se.hat.nu.hat.mle,
  nu.hat.mle + qnorm( 0.9995 ) * se.hat.nu.hat.mle ) )

# [1] 0.1723333 5.8440778

# notice how close the 99.9% interval for nu came to going negative,
# which of course would be incoherent (logically internally inconsistent)

# ---------------------- likelihood analysis ----------------------
#
#                          standard          99.9% interval
#   parameter   estimate   error           lower       upper
#
#   mu           404.3     0.461           402.7       405.8
#   sigma          3.700   0.420             2.317       5.083
#   nu             3.008   0.862             0.1723      5.844

########################################################################

# now let's visualize the log likelihood and likelihood functions

# full visualization would require 4 dimensions (3 for
# the parameters and one for the (log) likelihood)

# let's sneak up on that by fixing one of the three parameters
# at its MLE value and making 3-dimensional contour and perspective plots
# over the other 2 coordinates

n.grid <- 50

mu.grid <- seq( mu.interval[ 1 ], mu.interval[ 2 ], length = n.grid )
sigma.grid <- seq( sigma.interval[ 1 ], sigma.interval[ 2 ], length = n.grid )
nu.grid <- seq( nu.interval[ 1 ], nu.interval[ 2 ], length = n.grid )

# first, let's look at the mu-sigma story, with nu set to nu.hat.mle

log.likelihood.mu.sigma.grid <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {
  for ( j in 1:n.grid ) {

    log.likelihood.mu.sigma.grid[ i, j ] <- log.likelihood.t.sampling.model(
      c( mu.grid[ i ], sigma.grid[ j ], nu.hat.mle ), y )

  }
}

par( mfrow = c( 2, 2 ) )

contour( mu.grid, sigma.grid, log.likelihood.mu.sigma.grid, xlab = 'mu',
  ylab = 'sigma', nlevels = 10, col = 'blue', lwd = 2, main = 'log likelihood' )

persp( mu.grid, sigma.grid, log.likelihood.mu.sigma.grid, xlab = 'mu',
  ylab = 'sigma', theta = 30, phi = 30, zlab = 'log likelihood', col = 'red' )

contour( mu.grid, sigma.grid, exp( log.likelihood.mu.sigma.grid ), xlab = 'mu',
  ylab = 'sigma', nlevels = 10, col = 'blue', lwd = 2, main = 'likelihood' )

persp( mu.grid, sigma.grid, exp( log.likelihood.mu.sigma.grid ), xlab = 'mu',
  ylab = 'sigma', theta = 30, phi = 30, zlab = 'likelihood', col = 'red' )

par( mfrow = c( 1, 1 ) )

# along these two dimensions of the 3-dimensional (log) likelihood function,
# everything looks nice and unimodal (and not far from bivariate normality)
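# (this next plot is an addition to the original analysis) one way to check the
# 'not far from bivariate normality' remark is to overlay, on the exact log
# likelihood contours, the contours of the quadratic (normal) approximation to
# this ( mu, sigma ) slice, built from the corresponding 2 x 2 block of the
# hessian at the MLE; if the remark is on target, the two sets of contours
# should nearly coincide near the mode

hessian.block.mu.sigma <- numerical.mle.solution$hessian[ 1:2, 1:2 ]

quadratic.approximation.mu.sigma.grid <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {
  for ( j in 1:n.grid ) {

    delta <- c( mu.grid[ i ] - mu.hat.mle, sigma.grid[ j ] - sigma.hat.mle )

    quadratic.approximation.mu.sigma.grid[ i, j ] <- numerical.mle.solution$value +
      0.5 * as.numeric( t( delta ) %*% hessian.block.mu.sigma %*% delta )

  }
}

contour.levels <- pretty( range( log.likelihood.mu.sigma.grid ), 10 )

contour( mu.grid, sigma.grid, log.likelihood.mu.sigma.grid, levels = contour.levels,
  col = 'blue', lwd = 2, xlab = 'mu', ylab = 'sigma',
  main = 'exact (blue) versus quadratic approximation (red)' )

contour( mu.grid, sigma.grid, quadratic.approximation.mu.sigma.grid,
  levels = contour.levels, col = 'red', lwd = 2, lty = 2, add = T )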
# the next object is a helper function that (a) calls install.packages
# and then require, if the package still needs to be fetched from CRAN,
# or (b) calls only require if the package has already been installed
# on the machine that's running R in this session

dynamic.require <- function( package ) {

  if ( eval( parse( text = paste( 'require(', package, ')' ) ) ) ) {
    return( 'done' )
  }

  install.packages( package )

  return( eval( parse( text = paste( 'require(', package, ')' ) ) ) )

}

dynamic.require( 'plotly' )

# here i'll make the fancier interactive perspective plot
# just for the likelihood function

# note that plot_ly's add_surface expects the z matrix to have one row per
# y value and one column per x value, which is why the grid is transposed

likelihood.mu.sigma.grid.t <- t( exp( log.likelihood.mu.sigma.grid ) )

interactive.mu.sigma.likelihood.perspective.plot <- plot_ly( x = mu.grid,
  y = sigma.grid, z = likelihood.mu.sigma.grid.t ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of the bivariate density',
    scene = list( xaxis = list( title = 'mu' ), yaxis = list( title = 'sigma' ),
      zaxis = list( title = 'likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Bivariate Density' )

interactive.mu.sigma.likelihood.perspective.plot

# next, the mu-nu story, with sigma set to sigma.hat.mle

log.likelihood.mu.nu.grid <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {
  for ( j in 1:n.grid ) {

    log.likelihood.mu.nu.grid[ i, j ] <- log.likelihood.t.sampling.model(
      c( mu.grid[ i ], sigma.hat.mle, nu.grid[ j ] ), y )

  }
}

par( mfrow = c( 2, 2 ) )

contour( mu.grid, nu.grid, log.likelihood.mu.nu.grid, xlab = 'mu', ylab = 'nu',
  nlevels = 30, col = 'blue', lwd = 2, main = 'log likelihood' )

persp( mu.grid, nu.grid, log.likelihood.mu.nu.grid, xlab = 'mu', ylab = 'nu',
  theta = 30, phi = 30, zlab = 'log likelihood', col = 'red' )

contour( mu.grid, nu.grid, exp( log.likelihood.mu.nu.grid ), xlab = 'mu',
  ylab = 'nu', nlevels = 10, col = 'blue', lwd = 2, main = 'likelihood' )

persp( mu.grid, nu.grid, exp( log.likelihood.mu.nu.grid ), xlab = 'mu',
  ylab = 'nu', theta = 30, phi = 30, zlab = 'likelihood', col = 'red' )

par( mfrow = c( 1, 1 ) )

# notice how flat the log likelihood surface is in the nu direction;
# even with n = 100 observations we still have substantial uncertainty
# about nu in this sampling model

# even so, again, along *these* two dimensions, everything
# looks nice and unimodal (and, again, not far from bivariate normality)

likelihood.mu.nu.grid.t <- t( exp( log.likelihood.mu.nu.grid ) )

interactive.mu.nu.likelihood.perspective.plot <- plot_ly( x = mu.grid,
  y = nu.grid, z = likelihood.mu.nu.grid.t ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of the bivariate density',
    scene = list( xaxis = list( title = 'mu' ), yaxis = list( title = 'nu' ),
      zaxis = list( title = 'likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Bivariate Density' )

interactive.mu.nu.likelihood.perspective.plot

# and finally, the sigma-nu story, with mu set to mu.hat.mle

log.likelihood.sigma.nu.grid <- matrix( NA, n.grid, n.grid )

for ( i in 1:n.grid ) {
  for ( j in 1:n.grid ) {

    log.likelihood.sigma.nu.grid[ i, j ] <- log.likelihood.t.sampling.model(
      c( mu.hat.mle, sigma.grid[ i ], nu.grid[ j ] ), y )

  }
}

par( mfrow = c( 2, 2 ) )

contour( sigma.grid, nu.grid, log.likelihood.sigma.nu.grid, xlab = 'sigma',
  ylab = 'nu', nlevels = 30, col = 'blue', lwd = 2, main = 'log likelihood' )

persp( sigma.grid, nu.grid, log.likelihood.sigma.nu.grid, xlab = 'sigma',
  ylab = 'nu', theta = 30, phi = 30, zlab = 'log likelihood', col = 'red' )

contour( sigma.grid, nu.grid, exp( log.likelihood.sigma.nu.grid ), xlab = 'sigma',
  ylab = 'nu', nlevels = 10, col = 'blue', lwd = 2, main = 'likelihood' )

persp( sigma.grid, nu.grid, exp( log.likelihood.sigma.nu.grid ), xlab = 'sigma',
  ylab = 'nu', theta = 30, phi = 30, zlab = 'likelihood', col = 'red' )

par( mfrow = c( 1, 1 ) )

# here's where something more interesting is happening:
# the log likelihood perspective plot in the ( sigma, nu )
# visualization looks *nothing* like a paraboloid, and
# the likelihood contour plot shows a long-right-hand-tail
# shape in both parameters, especially nu
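# (this next calculation is an addition to the original analysis) the long
# right tail in nu can be seen even more directly with a profile log likelihood:
# for each value of nu on a grid, maximize the log likelihood over ( mu, sigma )
# and plot the maximized value against nu; the grid below (from 1 to 15) is an
# arbitrary choice, just for illustration

nu.profile.grid <- seq( 1, 15, length = 50 )

profile.log.likelihood.nu <- rep( NA, length( nu.profile.grid ) )

for ( k in 1:length( nu.profile.grid ) ) {

  profile.fit <- optim( c( mu.hat.mle, sigma.hat.mle ),
    function( phi, y, nu ) log.likelihood.t.sampling.model( c( phi, nu ), y ),
    y = y, nu = nu.profile.grid[ k ], method = 'L-BFGS-B',
    lower = c( -Inf, epsilon ), control = list( fnscale = -1 ) )

  profile.log.likelihood.nu[ k ] <- profile.fit$value

}

plot( nu.profile.grid, profile.log.likelihood.nu, type = 'l', lwd = 2,
  col = 'blue', xlab = 'nu', ylab = 'profile log likelihood' )

abline( v = nu.hat.mle, lty = 2 )

# the profile curve should drop off steeply to the left of nu.hat.mle and only
# slowly to the right, which is exactly the asymmetry that made the
# standard-error-based interval for nu nearly go negative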
likelihood.sigma.nu.grid.t <- t( exp( log.likelihood.sigma.nu.grid ) )

interactive.sigma.nu.likelihood.perspective.plot <- plot_ly( x = sigma.grid,
  y = nu.grid, z = likelihood.sigma.nu.grid.t ) %>%
  add_surface( contours = list( z = list( show = T, usecolormap = T,
    highlightcolor = '#ff0000', project = list( z = T ) ) ) ) %>%
  layout( title = 'Perspective plot of the bivariate density',
    scene = list( xaxis = list( title = 'sigma' ), yaxis = list( title = 'nu' ),
      zaxis = list( title = 'likelihood' ),
      camera = list( eye = list( x = 1.5, y = 0.5, z = -0.5 ) ) ) ) %>%
  colorbar( title = 'Bivariate Density' )

interactive.sigma.nu.likelihood.perspective.plot

# in fact, the skewness of the marginal distribution for nu is so strong
# that going qnorm( 0.9995 ) =. 3.29 standard errors either way from the MLE
# is not far enough out in the right tail

######################################################################

# now let's do the 4-dimensional visualization

# with the help of one of my graduate students (erdong guo),
# i developed the following code using the function 'plot_ly'
# from the plotly package

# i'll show you two different graphs, with plotting symbols
# of different sizes

##### plot 1

n.grid <- 20

mu.grid <- seq( mu.interval[ 1 ], mu.interval[ 2 ], length = n.grid )
sigma.grid <- seq( sigma.interval[ 1 ], sigma.interval[ 2 ], length = n.grid )
nu.grid <- seq( nu.interval[ 1 ], nu.interval[ 2 ], length = n.grid )

grid.plot <- 20

likelihood.mu.sigma.nu.grid <- rep( NA, grid.plot * grid.plot * grid.plot )

mu.grid.3d.plot <- rep( NA, grid.plot * grid.plot * grid.plot )
sigma.grid.3d.plot <- rep( NA, grid.plot * grid.plot * grid.plot )
nu.grid.3d.plot <- rep( NA, grid.plot * grid.plot * grid.plot )

count <- 1

for ( i in 1:grid.plot ) {
  for ( j in 1:grid.plot ) {
    for ( k in 1:grid.plot ) {

      likelihood.mu.sigma.nu.grid[ count ] <- exp( log.likelihood.t.sampling.model(
        c( mu.grid[ i ], sigma.grid[ j ], nu.grid[ k ] ), y ) )

      mu.grid.3d.plot[ count ] <- mu.grid[ i ]
      sigma.grid.3d.plot[ count ] <- sigma.grid[ j ]
      nu.grid.3d.plot[ count ] <- nu.grid[ k ]

      count <- count + 1

    }
  }
}

interactive.mu.sigma.nu.likelihood.scatter.plot <- plot_ly( type = 'scatter3d',
  mode = 'markers', x = mu.grid.3d.plot, y = sigma.grid.3d.plot,
  z = nu.grid.3d.plot, marker = list( color = likelihood.mu.sigma.nu.grid,
    colorscale = list( c( 0, rgb( 1, 1, 1, 0 ) ), c( 1, '#8B0000FF' ) ),
    showscale = T ) ) %>%
  add_markers( ) %>%
  layout( scene = list( xaxis = list( title = 'mu' ),
    yaxis = list( title = 'sigma' ), zaxis = list( title = 'nu' ) ) )

interactive.mu.sigma.nu.likelihood.scatter.plot

##### plot 2

interactive.mu.sigma.nu.likelihood.scatter.plot <- plot_ly( type = 'scatter3d',
  mode = 'markers', x = mu.grid.3d.plot, y = sigma.grid.3d.plot,
  z = nu.grid.3d.plot, marker = list( color = likelihood.mu.sigma.nu.grid,
    colorscale = list( c( 0, rgb( 1, 1, 1, 0 ) ), c( 1, '#8B0000FF' ) ),
    showscale = T, size = 2 ) ) %>%
  add_markers( ) %>%
  layout( scene = list( xaxis = list( title = 'mu' ),
    yaxis = list( title = 'sigma' ), zaxis = list( title = 'nu' ) ) )

interactive.mu.sigma.nu.likelihood.scatter.plot

# from both plots you can see that the 3-dimensional likelihood function
# has a single mode in the middle of the 3-D box; if the likelihood
# had been multi-modal, there would have been two or more distinct
# point clouds with substantial likelihood values

############################################################################
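# postscript (an addition to the original analysis): as a small numerical check
# on the conclusion just drawn, the grid point with the largest likelihood in
# the 3-D scatter-plot grid should sit close to the MLE found by optim

highest.likelihood.grid.point <- which.max( likelihood.mu.sigma.nu.grid )

c( mu.grid.3d.plot[ highest.likelihood.grid.point ],
  sigma.grid.3d.plot[ highest.likelihood.grid.point ],
  nu.grid.3d.plot[ highest.likelihood.grid.point ] )

# this should agree with theta.hat.mle = ( 404.278, 3.700, 3.008 ) up to the
# coarseness of the 20 x 20 x 20 grid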