In [62]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [46]:
# We can infer the distribution of the sample mean (i.e. a confidence interval for the expected value)
# by repeatedly resampling from the population of observations. For each iteration we record the mean
# of the resample; the result is a sample of means. (The cell after the function sketches how to turn
# that sample of means into a percentile interval.)
def bootstrap_mean(population, sample_size, num_samples):
    sampled_expected_values = []
    for i in range(num_samples):
        sample = np.random.choice(a=population, size=sample_size, replace=True)
        sampled_expected_values.append(np.average(sample))
    return sampled_expected_values
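In [ ]:
# A minimal sketch of the percentile interval mentioned above. The 95% level and the toy normal
# population used here are assumptions for illustration; they are not fixed by the function itself.
np.random.seed(15251)
toy_population = np.random.normal(loc=100, scale=10, size=1000)
means = bootstrap_mean(population=toy_population, sample_size=30, num_samples=1000)
ci_lower, ci_upper = np.percentile(means, [2.5, 97.5])
print(f"95% bootstrap CI for the mean: [{ci_lower:.2f}, {ci_upper:.2f}]")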
In [53]:
# Notice how the sample size affects the distribution of sampled means (i.e. the bootstrapped confidence interval).
# Which sample size should we pick?
# Sample sizes of 1 and len(population) both give the correct expected value
# but fail to provide meaningful information about how confident we are in the expected value.
# (The cell after this one prints the spread of the sampled means for each sample size.)
num_samples = 1000
np.random.seed(15251)
population_size = 1000
population = np.random.normal(loc=100, scale=10, size=population_size)
sample_sizes = [1, 5, 10, 20, 30, 100, 500, population_size]
fig, axs = plt.subplots(len(sample_sizes), 1, sharex=True)

for i, sample_size in enumerate(sample_sizes):
    np.random.seed(65390)
    sampled_means = bootstrap_mean(population=population, sample_size=sample_size, num_samples=num_samples)
    axs[i].hist(sampled_means, bins=30)
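In [ ]:
# A rough numeric companion to the histograms above: the spread of the sampled means shrinks
# roughly like population_std / sqrt(sample_size). Sketch only, reusing the seeds and toy
# population from the previous cell.
np.random.seed(15251)
population = np.random.normal(loc=100, scale=10, size=1000)
for sample_size in [1, 5, 10, 20, 30, 100, 500, 1000]:
    np.random.seed(65390)
    means = bootstrap_mean(population=population, sample_size=sample_size, num_samples=1000)
    print(f"n={sample_size:4d}  std of sampled means={np.std(means):.3f}  "
          f"sigma/sqrt(n)={np.std(population) / math.sqrt(sample_size):.3f}")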
In [59]:
# How does increasing the population size change the distribution of our sampled means
# (i.e. bootstrapped confidence intervals), assuming a fixed sample size of 30?
# After a certain point it doesn't.
# Question to the reader: why doesn't the variance of the sampled means decrease with more evidence?
# Hint: bootstrapping confidence intervals of expected values is distribution agnostic;
# each iteration only ever looks at sample_size observations, however large the population is.
# (The cell after this one prints the spread of the sampled means for each population size.)
num_samples = 1000
population_sizes = [10, 30, 100, 1000, 10000, 100000, 1000000]
fig, axs = plt.subplots(len(population_sizes), 1, sharex=True)

for i, population_size in enumerate(population_sizes):
    np.random.seed(15251)
    population = np.random.normal(loc=100, scale=10, size=population_size)

    np.random.seed(65390)
    sampled_means = bootstrap_mean(population=population, sample_size=30, num_samples=num_samples)
    axs[i].hist(sampled_means, bins=30)
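In [ ]:
# A rough numeric check of the point above: with sample_size fixed at 30, the spread of the
# sampled means stays close to sigma/sqrt(30) no matter how large the population gets.
# Sketch only, reusing the population sizes and seeds from the previous cell.
for population_size in [10, 30, 100, 1000, 10000, 100000, 1000000]:
    np.random.seed(15251)
    population = np.random.normal(loc=100, scale=10, size=population_size)
    np.random.seed(65390)
    means = bootstrap_mean(population=population, sample_size=30, num_samples=1000)
    print(f"population_size={population_size:7d}  std of sampled means={np.std(means):.3f}")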
In [65]:
# How can one work around this limitation of bootstrapped confidence intervals?
# I recommend making the sample size a function of the population size.
# This is an ugly engineering solution to an unsolvable problem.
# Setting sample_size = sqrt(population_size) creates the illusion that our confidence interval
# shrinks as the population grows. (The cell after this one tracks the interval width directly.)
num_samples = 1000
population_sizes = [10, 30, 100, 1000, 10000, 100000, 1000000]
fig, axs = plt.subplots(len(population_sizes), 1, sharex=True)

for i, population_size in enumerate(population_sizes):
    np.random.seed(15251)
    population = np.random.normal(loc=100, scale=10, size=population_size)

    np.random.seed(65390)
    # Keep a floor of 10 so tiny populations still get a usable sample size.
    sample_size = max(int(math.sqrt(population_size)), 10)
    sampled_means = bootstrap_mean(population=population, sample_size=sample_size, num_samples=num_samples)
    axs[i].hist(sampled_means, bins=30)
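In [ ]:
# Sketch: under the sqrt heuristic the width of the bootstrapped interval narrows as the
# population grows (roughly like population_size**-0.25 for this normal toy data).
# The 95% percentile level is an assumption for illustration.
for population_size in [100, 1000, 10000, 100000, 1000000]:
    np.random.seed(15251)
    population = np.random.normal(loc=100, scale=10, size=population_size)
    np.random.seed(65390)
    sample_size = max(int(math.sqrt(population_size)), 10)
    means = bootstrap_mean(population=population, sample_size=sample_size, num_samples=1000)
    lo, hi = np.percentile(means, [2.5, 97.5])
    print(f"population_size={population_size:7d}  sample_size={sample_size:4d}  95% CI width={hi - lo:.3f}")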
In [ ]: