############################################################################
# Keeping track of red-or-not at roulette.
#
# Simulation study of the repeated-sampling (frequentist) distribution of
# the sample mean theta.hat for IID draws (with replacement) from the
# population of pockets on an American roulette wheel: 18 red out of 38.
#
# (Unbuffer the output when running interactively.)
# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2020' )

# The population data set: one entry per pocket, 1 if red, 0 if not
# (20 non-red pockets, 18 red pockets).

print( population.data.set <- c( rep( 0, 20 ), rep( 1, 18 ) ) )

# [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
# [31] 1 1 1 1 1 1 1 1

# Plot the population PMF: P( X = 0 ) = 20/38, P( X = 1 ) = 18/38.

plot( 0, 0, type = 'n', xlim = c( -0.5, 1.5 ), xlab = '1 if red, 0 if not',
  main = 'Population PMF: red or not', ylab = 'PMF value',
  ylim = c( 0, 0.55 ) )
segments( 0, 0, 0, 20 / 38, lwd = 2, col = 'red' )
segments( 1, 0, 1, 18 / 38, lwd = 2, col = 'red' )

# The population mean is theta.

print( theta <- mean( population.data.set ) )

# [1] 0.4736842

# The population SD is sigma = sqrt( theta * ( 1 - theta ) ).

print( sigma <- sqrt( mean( ( population.data.set - theta )^2 ) ) )

# [1] 0.499307

sqrt( theta * ( 1 - theta ) )

# [1] 0.499307

# Helper: create a repeated-sampling data set.
#
#   M          number of simulation replications
#   n          sample size for each replication
#   population the population data set to sample from (with replacement)
#   seed       random-number-generator seed, so that runs are reproducible
#
# Returns a vector of M IID simulated replications of
# theta.hat = the mean of a sample of size n from the population.
# (Same RNG stream as a bare set.seed + for-loop of sample( ) calls.)

simulate.theta.hat <- function( M, n, population, seed ) {

  set.seed( seed )

  # Preallocate the result vector; growing it inside the loop would
  # copy it on every iteration.

  theta.hat <- rep( NA, M )

  for ( m in seq_len( M ) ) {

    theta.hat[ m ] <- mean( sample( population, n, replace = TRUE ) )

  }

  return( theta.hat )

}

# Create the repeated-sampling data set; sample size is n.

n <- 10
M <- 100000
random.number.generator.seed <- 4391557

system.time(
  repeated.sampling.data.set <- simulate.theta.hat( M, n,
    population.data.set, random.number.generator.seed )
)

#    user  system elapsed
#    1.15    0.00    1.16

# On my home desktop, with a single thread at 3.4GHz,
# M = 100,000 simulation replications took about 1 sec of clock time.

M <- 1000000

system.time(
  repeated.sampling.data.set <- simulate.theta.hat( M, n,
    population.data.set, random.number.generator.seed )
)

#    user  system elapsed
#   11.20    0.02   11.22

# On my home desktop, with a single thread at 3.4GHz,
# M = 1,000,000 simulation replications took about 11 sec,
# almost exactly 10 times longer.

# The repeated-sampling data set contains M IID simulated
# replications of theta.hat = sample mean.

# The (long-run) mean of the theta.hat values in the
# repeated-sampling data set is our simulation estimate of the
# expected value of the random variable theta.hat, which we know
# is theta = 0.4736842.

print( expected.value.of.theta.hat <- mean( repeated.sampling.data.set ) )

# [1] 0.4739401

# So with M = 1,000,000 simulation replications, the simulation noise
# is in the fourth digit to the right of the decimal point.

# The (long-run) SD of the theta.hat values in the repeated-sampling
# data set is our simulation estimate of the standard error of the
# random variable theta.hat, which we know is
# sqrt( theta * ( 1 - theta ) / n ).

sqrt( theta * ( 1 - theta ) / n )

# [1] 0.1578947

print( standard.error.of.theta.hat <- sd( repeated.sampling.data.set ) )

# [1] 0.1579019

# Again, with M = 1,000,000 simulation replications, the simulation
# noise is in the fourth digit to the right of the decimal point.

# Let's get a relative frequency distribution of the simulated
# theta.hat values.

table( repeated.sampling.data.set ) / M

# repeated.sampling.data.set
#        0      0.1      0.2      0.3      0.4      0.5      0.6
# 0.001627 0.014571 0.059352 0.142520 0.224295 0.243032 0.182159
#      0.7      0.8      0.9        1
# 0.093761 0.031771 0.006343 0.000569

# So with n only 10, there's a lot of uncertainty about what theta is
# on the basis of theta.hat: the most likely theta.hat values are
# 0.4 and 0.5, but 0.3 and 0.6 also have substantial probability.

hist( repeated.sampling.data.set, prob = T, main = 'n = 10',
  breaks = 100, xlab = 'theta.hat' )

# The repeated-sampling (frequentist) distribution of theta.hat
# is still strongly discrete (PMF) with n only 10, but it already
# looks like a discrete version of the normal curve
# (this is of course the Central Limit Theorem (CLT) in action).

theta.hat.grid <- seq( 0, 1, length = 500 )

lines( theta.hat.grid, dnorm( theta.hat.grid, expected.value.of.theta.hat,
  standard.error.of.theta.hat ), lwd = 2, col = 'red' )

# It doesn't look like the normal approximation to the PMF of
# theta.hat is very good with n = 10, but if i force the PMF to look
# more continuous by using fewer histogram bars ...

hist( repeated.sampling.data.set, prob = T, main = 'n = 10',
  breaks = 12, xlab = 'theta.hat' )

lines( theta.hat.grid, dnorm( theta.hat.grid, expected.value.of.theta.hat,
  standard.error.of.theta.hat ), lwd = 2, col = 'red' )

# ... you can see that it's already surprisingly good,
# at least for some purposes.

# Let's compute the simulation approximation to
#
#   P ( | theta.hat - theta | <= epsilon )
#
# for a small epsilon such as 0.01; we want this probability to be big.

epsilon <- 0.01

mean( abs( repeated.sampling.data.set - theta ) <= epsilon )

# [1] 0

# With n only 10, this accuracy goal cannot yet be met.

# The corresponding normal approximation is

pnorm( epsilon / standard.error.of.theta.hat ) -
  pnorm( - epsilon / standard.error.of.theta.hat )

# [1] 0.05049665

###############################################################################

# Let's crank n up to 100 and have a look:

n <- 100
random.number.generator.seed <- 6350823

system.time(
  repeated.sampling.data.set <- simulate.theta.hat( M, n,
    population.data.set, random.number.generator.seed )
)

#    user  system elapsed
#   13.03    0.00   13.03

# Interestingly, multiplying n by 10 has little effect on the clock time.

print( expected.value.of.theta.hat <- mean( repeated.sampling.data.set ) )

# [1] 0.4737013

# The right answer is still 0.4736842, so with M = 1,000,000
# simulation replications, the simulation noise is again in the
# fourth digit to the right of the decimal point.

sqrt( theta * ( 1 - theta ) / n )

# [1] 0.0499307

print( standard.error.of.theta.hat <- sd( repeated.sampling.data.set ) )

# [1] 0.04993065

# By lucky random sampling the simulation noise is almost 0
# for the standard error with this random number seed.

table( repeated.sampling.data.set ) / M

# repeated.sampling.data.set
#     0.23     0.24     0.25     0.26     0.27     0.28     0.29      0.3     0.31     0.32
# 0.000001 0.000001 0.000003 0.000005 0.000017 0.000043 0.000074 0.000169 0.000355 0.000684
#     0.33     0.34     0.35     0.36     0.37     0.38     0.39      0.4     0.41     0.42
# 0.001166 0.002217 0.003585 0.005850 0.009286 0.013584 0.020008 0.027216 0.035270 0.045308
#     0.43     0.44     0.45     0.46     0.47     0.48     0.49      0.5     0.51     0.52
# 0.054952 0.063598 0.071134 0.076732 0.079489 0.079018 0.075618 0.069454 0.061233 0.051764
#     0.53     0.54     0.55     0.56     0.57     0.58     0.59      0.6     0.61     0.62
# 0.042328 0.033097 0.025022 0.017996 0.012468 0.008450 0.005274 0.003389 0.001900 0.001083
#     0.63     0.64     0.65     0.66     0.67     0.68     0.69      0.7     0.71     0.72
# 0.000574 0.000314 0.000145 0.000061 0.000036 0.000018 0.000005 0.000003 0.000002 0.000001

hist( repeated.sampling.data.set, prob = T, main = 'n = 100',
  breaks = 1000, xlab = 'theta.hat' )

# The repeated-sampling (frequentist) distribution of theta.hat
# is still discrete (PMF) with n = 100, but much less strongly so
# than with n = 10, and it looks even more like a discrete version
# of the normal curve (the CLT in action again).

lines( theta.hat.grid, dnorm( theta.hat.grid, expected.value.of.theta.hat,
  standard.error.of.theta.hat ), lwd = 2, col = 'red' )

hist( repeated.sampling.data.set, prob = T, main = 'n = 100',
  breaks = 50, xlab = 'theta.hat' )

# With fewer histogram bars the normal curve tracks the
# repeated-sampling distribution of theta.hat closely at n = 100.

lines( theta.hat.grid, dnorm( theta.hat.grid, expected.value.of.theta.hat,
  standard.error.of.theta.hat ), lwd = 2, col = 'red' )

mean( abs( repeated.sampling.data.set - theta ) <= epsilon )

# [1] 0.158507

# With n of 100, this accuracy goal is still too stringent.

# The corresponding normal approximation is

pnorm( epsilon / standard.error.of.theta.hat ) -
  pnorm( - epsilon / standard.error.of.theta.hat )

# [1] 0.1587367

# The normal approximation is now superb.

# How large would n need to be for
#
#   P ( | theta.hat - theta | <= epsilon ) >= ( 1 - gamma )
#
# for some small gamma such as 0.05?