############################################################################
# Keeping track of red-or-not at roulette.
#
# Simulation study of the repeated-sampling (frequentist) distribution of
# the sample mean theta.hat for IID draws (with replacement) from the
# population of pockets on an American roulette wheel: 18 red out of 38.
#
# (Unbuffer the output when running interactively.)
# setwd( 'C:/DD/Teaching/AMS-STAT-206/Winter-2020' )

# The population data set: one entry per pocket, 1 if red, 0 if not
# (20 non-red pockets, 18 red pockets).

print( population.data.set <- c( rep( 0, 20 ), rep( 1, 18 ) ) )

# [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
# [31] 1 1 1 1 1 1 1 1

# Plot the population PMF: P( X = 0 ) = 20/38, P( X = 1 ) = 18/38.

plot( 0, 0, type = 'n', xlim = c( -0.5, 1.5 ), xlab = '1 if red, 0 if not',
  main = 'Population PMF: red or not', ylab = 'PMF value',
  ylim = c( 0, 0.55 ) )
segments( 0, 0, 0, 20 / 38, lwd = 2, col = 'red' )
segments( 1, 0, 1, 18 / 38, lwd = 2, col = 'red' )

# The population mean is theta.

print( theta <- mean( population.data.set ) )

# [1] 0.4736842

# The population SD is sigma = sqrt( theta * ( 1 - theta ) ).

print( sigma <- sqrt( mean( ( population.data.set - theta )^2 ) ) )

# [1] 0.499307

sqrt( theta * ( 1 - theta ) )

# [1] 0.499307

# Helper: create a repeated-sampling data set.
#
#   M          number of simulation replications
#   n          sample size for each replication
#   population the population data set to sample from (with replacement)
#   seed       random-number-generator seed, so that runs are reproducible
#
# Returns a vector of M IID simulated replications of
# theta.hat = the mean of a sample of size n from the population.
# (Same RNG stream as a bare set.seed + for-loop of sample( ) calls.)

simulate.theta.hat <- function( M, n, population, seed ) {

  set.seed( seed )

  # Preallocate the result vector; growing it inside the loop would
  # copy it on every iteration.

  theta.hat <- rep( NA, M )

  for ( m in seq_len( M ) ) {

    theta.hat[ m ] <- mean( sample( population, n, replace = TRUE ) )

  }

  return( theta.hat )

}

# Create the repeated-sampling data set; sample size is n.

n <- 10
M <- 100000
random.number.generator.seed <- 4391557

system.time(
  repeated.sampling.data.set <- simulate.theta.hat( M, n,
    population.data.set, random.number.generator.seed )
)

#    user  system elapsed
#    1.15    0.00    1.16

# On my home desktop, with a single thread at 3.4GHz,
# M = 100,000 simulation replications took about 1 sec of clock time.

M <- 1000000

system.time(
  repeated.sampling.data.set <- simulate.theta.hat( M, n,
    population.data.set, random.number.generator.seed )
)

#    user  system elapsed
#   11.20    0.02   11.22

# On my home desktop, with a single thread at 3.4GHz,
# M = 1,000,000 simulation replications took about 11 sec,
# almost exactly 10 times longer.

# The repeated-sampling data set contains M IID simulated
# replications of theta.hat = sample mean.

# The (long-run) mean of the theta.hat values in the
# repeated-sampling data set is our simulation estimate of the
# expected value of the random variable theta.hat, which we know
# is theta = 0.4736842.

print( expected.value.of.theta.hat <- mean( repeated.sampling.data.set ) )

# [1] 0.4739401

# So with M = 1,000,000 simulation replications, the simulation noise
# is in the fourth digit to the right of the decimal point.

# The (long-run) SD of the theta.hat values in the repeated-sampling
# data set is our simulation estimate of the standard error of the
# random variable theta.hat, which we know is
# sqrt( theta * ( 1 - theta ) / n ).

sqrt( theta * ( 1 - theta ) / n )

# [1] 0.1578947

print( standard.error.of.theta.hat <- sd( repeated.sampling.data.set ) )

# [1] 0.1579019

# Again, with M = 1,000,000 simulation replications, the simulation
# noise is in the fourth digit to the right of the decimal point.

# Let's get a relative frequency distribution of the simulated
# theta.hat values.

table( repeated.sampling.data.set ) / M

# repeated.sampling.data.set
#        0      0.1      0.2      0.3      0.4      0.5      0.6
# 0.001627 0.014571 0.059352 0.142520 0.224295 0.243032 0.182159
#      0.7      0.8      0.9        1
# 0.093761 0.031771 0.006343 0.000569

# So with n only 10, there's a lot of uncertainty about what theta is
# on the basis of theta.hat: the most likely theta.hat values are
# 0.4 and 0.5, but 0.3 and 0.6 also have substantial probability.

hist( repeated.sampling.data.set, prob = T, main = 'n = 10',
  breaks = 100, xlab = 'theta.hat' )

# The repeated-sampling (frequentist) distribution of theta.hat
# is still strongly discrete (PMF) with n only 10, but it already
# looks like a discrete version of the normal curve
# (this is of course the Central Limit Theorem (CLT) in action).

theta.hat.grid <- seq( 0, 1, length = 500 )

lines( theta.hat.grid, dnorm( theta.hat.grid, expected.value.of.theta.hat,
  standard.error.of.theta.hat ), lwd = 2, col = 'red' )

# It doesn't look like the normal approximation to the PMF of
# theta.hat is very good with n = 10, but if i force the PMF to look
# more continuous by using fewer histogram bars ...

hist( repeated.sampling.data.set, prob = T, main = 'n = 10',
  breaks = 12, xlab = 'theta.hat' )

lines( theta.hat.grid, dnorm( theta.hat.grid, expected.value.of.theta.hat,
  standard.error.of.theta.hat ), lwd = 2, col = 'red' )

# ... you can see that it's already surprisingly good,
# at least for some purposes.

# Let's compute the simulation approximation to
#
#   P ( | theta.hat - theta | <= epsilon )
#
# for a small epsilon such as 0.01; we want this probability to be big.

epsilon <- 0.01

mean( abs( repeated.sampling.data.set - theta ) <= epsilon )

# [1] 0

# With n only 10, this accuracy goal cannot yet be met.

# The corresponding normal approximation is

pnorm( epsilon / standard.error.of.theta.hat ) -
  pnorm( - epsilon / standard.error.of.theta.hat )

# [1] 0.05049665

###############################################################################

# Let's crank n up to 100 and have a look:

n <- 100
random.number.generator.seed <- 6350823

system.time(
  repeated.sampling.data.set <- simulate.theta.hat( M, n,
    population.data.set, random.number.generator.seed )
)

#    user  system elapsed
#   13.03    0.00   13.03

# Interestingly, multiplying n by 10 has little effect on the clock time.

print( expected.value.of.theta.hat <- mean( repeated.sampling.data.set ) )

# [1] 0.4737013

# The right answer is still 0.4736842, so with M = 1,000,000
# simulation replications, the simulation noise is again in the
# fourth digit to the right of the decimal point.

sqrt( theta * ( 1 - theta ) / n )

# [1] 0.0499307

print( standard.error.of.theta.hat <- sd( repeated.sampling.data.set ) )

# [1] 0.04993065

# By lucky random sampling the simulation noise is almost 0
# for the standard error with this random number seed.

table( repeated.sampling.data.set ) / M

# repeated.sampling.data.set
#     0.23     0.24     0.25     0.26     0.27     0.28     0.29      0.3     0.31     0.32
# 0.000001 0.000001 0.000003 0.000005 0.000017 0.000043 0.000074 0.000169 0.000355 0.000684
#     0.33     0.34     0.35     0.36     0.37     0.38     0.39      0.4     0.41     0.42
# 0.001166 0.002217 0.003585 0.005850 0.009286 0.013584 0.020008 0.027216 0.035270 0.045308
#     0.43     0.44     0.45     0.46     0.47     0.48     0.49      0.5     0.51     0.52
# 0.054952 0.063598 0.071134 0.076732 0.079489 0.079018 0.075618 0.069454 0.061233 0.051764
#     0.53     0.54     0.55     0.56     0.57     0.58     0.59      0.6     0.61     0.62
# 0.042328 0.033097 0.025022 0.017996 0.012468 0.008450 0.005274 0.003389 0.001900 0.001083
#     0.63     0.64     0.65     0.66     0.67     0.68     0.69      0.7     0.71     0.72
# 0.000574 0.000314 0.000145 0.000061 0.000036 0.000018 0.000005 0.000003 0.000002 0.000001

hist( repeated.sampling.data.set, prob = T, main = 'n = 100',
  breaks = 1000, xlab = 'theta.hat' )

# The repeated-sampling (frequentist) distribution of theta.hat
# is still discrete (PMF) with n = 100, but much less strongly so
# than with n = 10, and it looks even more like a discrete version
# of the normal curve (the CLT in action again).

lines( theta.hat.grid, dnorm( theta.hat.grid, expected.value.of.theta.hat,
  standard.error.of.theta.hat ), lwd = 2, col = 'red' )

hist( repeated.sampling.data.set, prob = T, main = 'n = 100',
  breaks = 50, xlab = 'theta.hat' )

# With fewer histogram bars the normal curve tracks the
# repeated-sampling distribution of theta.hat closely at n = 100.

lines( theta.hat.grid, dnorm( theta.hat.grid, expected.value.of.theta.hat,
  standard.error.of.theta.hat ), lwd = 2, col = 'red' )

mean( abs( repeated.sampling.data.set - theta ) <= epsilon )

# [1] 0.158507

# With n of 100, this accuracy goal is still too stringent.

# The corresponding normal approximation is

pnorm( epsilon / standard.error.of.theta.hat ) -
  pnorm( - epsilon / standard.error.of.theta.hat )

# [1] 0.1587367

# The normal approximation is now superb.

# How large would n need to be for
#
#   P ( | theta.hat - theta | <= epsilon ) >= ( 1 - gamma )
#
# for some small gamma such as 0.05?