Bootstrap

What are the properties of bootstrapped CCA and PLS estimations?

Setup

[1]:
import numpy as np
import xarray as xr
import scipy.linalg
import scipy.stats
from scipy.stats import pearsonr, zscore, beta
from scipy.spatial.distance import pdist, cdist, squareform

import pandas as pd

from sklearn.decomposition import PCA, SparsePCA
from sklearn.linear_model import LinearRegression
from sklearn.utils import check_random_state

from gemmr.data import load_outcomes, print_ds_stats
from gemmr.sample_analysis import *
from gemmr.sample_size import *
from gemmr.metrics import *

import matplotlib
import holoviews as hv
from holoviews import opts
hv.extension('matplotlib')
hv.renderer('matplotlib').set_param(dpi=120)

from my_config import *

import warnings
from matplotlib import MatplotlibDeprecationWarning
warnings.simplefilter('ignore', MatplotlibDeprecationWarning)
warnings.filterwarnings(
    'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
)  # holoviews emits this for log-linear plots
/Users/mdh56/Projects/gemmr/gemmr/sample_analysis/analyzers.py:13: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
  from tqdm.autonotebook import tqdm
[2]:
r_clrs = hv.Palette(cmap_r, samples=4).values
[3]:
res = dict(
    cca=load_outcomes('cca', tag='bs').sel(mode=0),
    pls=load_outcomes('pls', tag='axPlusay-2_bs').sel(mode=0)
)

What’s in the outcome data?

[4]:
print_ds_stats(res['cca'])
n_rep            100
n_bs             100
n_per_ftr        [  3   4   8  16  32  64 128]
r                [0.3 0.5 0.7 0.9]
px               [ 2  4  8 16 32 64]
ax+ay range     (0.00, 0.00)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 6, r: 4)>
array([[5, 5, 5, 5],
       [5, 5, 5, 5],
       [5, 5, 5, 5],
       [5, 5, 5, 5],
       [5, 5, 5, 5],
       [5, 5, 5, 5]])
Coordinates:
  * r        (r) float64 0.3 0.5 0.7 0.9
  * px       (px) int64 2 4 8 16 32 64

power           not calculated
[5]:
print_ds_stats(res['pls'])
n_rep            100
n_bs             100
n_per_ftr        [   3    4    8   16   32   64  128  256  512 1024]
r                [0.3 0.5 0.7 0.9]
px               [ 2  4  8 16 32 64]
ax+ay range     (-2.00, -2.00)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 6, r: 4)>
array([[5, 5, 5, 5],
       [5, 5, 5, 5],
       [5, 5, 5, 5],
       [5, 5, 5, 5],
       [5, 5, 5, 5],
       [5, 5, 5, 5]])
Coordinates:
  * r        (r) float64 0.3 0.5 0.7 0.9
  * px       (px) int64 2 4 8 16 32 64

power           not calculated

Association strength

[6]:
panels_assoc_strenth = {model: hv.Overlay() for model in res}

for model in res:

    assoc_strength = res[model].between_assocs.mean('rep').mean('Sigma_id').mean('px')
    assoc_strength_bs = res[model].between_assocs_bs.mean('bs').mean('rep').mean('Sigma_id').mean('px')

    for ri, r in enumerate([.3, .5, .7, .9]):
        panels_assoc_strenth[model] *= hv.Curve(
            assoc_strength.sel(r=r).rename('y')
        ).opts(color=r_clrs[ri])
        panels_assoc_strenth[model] *= hv.Curve(
            assoc_strength_bs.sel(r=r).rename('y')
        ).opts(color=r_clrs[ri], linestyle='--')

    if model == 'cca':
        panels_assoc_strenth[model] *= (
            hv.Text(25, .36, 'in-sample', halign='right', valign='top', fontsize=8, kdims=['n_per_ftr', 'y'])
            * hv.Text(25, .41, 'bootstrap', halign='left', valign='bottom', fontsize=8, kdims=['n_per_ftr', 'y'])
        )
    elif model == 'pls':
        panels_assoc_strenth[model] *= (
            hv.Text(100, .5, r'$r_\mathrm{true}=$', halign='right', valign='top', fontsize=7, kdims=['n_per_ftr', 'y'])
            * hv.Text(140, .5, '0.3', halign='left', valign='top', fontsize=7, kdims=['n_per_ftr', 'y']).opts(color=r_clrs[0])
            * hv.Text(1000, .5, '0.5', halign='right', valign='top', fontsize=7, kdims=['n_per_ftr', 'y']).opts(color=r_clrs[1])
            * hv.Text(140, .4, '0.7', halign='left', valign='top', fontsize=7, kdims=['n_per_ftr', 'y']).opts(color=r_clrs[2])
            * hv.Text(1000, .4, '0.9', halign='right', valign='top', fontsize=7, kdims=['n_per_ftr', 'y']).opts(color=r_clrs[3])
        )


    panels_assoc_strenth[model] = panels_assoc_strenth[model].redim(
        n_per_ftr=hv.Dimension('n_per_ftr_{}'.format(model), label='Samples per feature'),
        y=hv.Dimension('assoc_strengh_{}'.format(model), label='Association strength')
    )

fig_assoc_strength = (
    panels_assoc_strenth['cca']
    + panels_assoc_strenth['pls'].opts(ylabel='')
).opts(*fig_opts).opts(
    opts.Curve(linewidth=1.5),
    opts.Overlay(logx=True, logy=True, yticks=5)
)

fig_assoc_strength
/anaconda3/envs/gemmrtest/lib/python3.8/site-packages/xarray/core/nanops.py:142: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis=axis, dtype=dtype)
[6]:

Weight error

[7]:
panels_weight_error = {model: hv.Overlay() for model in res}

for model in res:
    for ri, r in enumerate([.3, .5, .7, .9]):
        if r in [.5, .7]: continue
        panels_weight_error[model] *= (
            hv.Curve(
                mk_weightError(res[model].sel(r=r)).mean('rep').mean('Sigma_id').mean('px'),
                label='in-sample'
            ).opts(color=r_clrs[ri])
            * hv.Curve(
                mk_weightError(res[model].sel(r=r), '_bs').mean('bs').mean('rep').mean('Sigma_id').mean('px'),
                label='bootstrap'
            ).opts(color=r_clrs[ri], linestyle='--')
        ).redim(
            y='Weight error',
            n_per_ftr='Samples per feature'
        )

fig_weight_error = (
    panels_weight_error['cca'].opts(xlim=(3, 128))
    + panels_weight_error['pls'].opts(ylabel='')
).opts(*fig_opts).opts(
    opts.Curve(linewidth=1.5),
    opts.Overlay(logx=True, logy=True, ylim=(None, 1), show_legend=False)
)

fig_weight_error
/anaconda3/envs/gemmrtest/lib/python3.8/site-packages/xarray/core/nanops.py:142: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis=axis, dtype=dtype)
[7]:
[8]:
panels_delta_n_req_per_ftr = {model: hv.Overlay() for model in res}

target_error = 0.1

for model in res:

    n_req_per_ftr_inSample = calc_n_required(
        mk_weightError(res[model]).mean('rep'),
        -target_error,
        target_error
    ).mean('Sigma_id')
    n_req_per_ftr_bs = calc_n_required(
        mk_weightError(res[model], '_bs').mean('rep').mean('bs'),
        -target_error,
        target_error
    ).mean('Sigma_id')

    panels_delta_n_req_per_ftr[model] *= hv.QuadMesh(
        (n_req_per_ftr_bs - n_req_per_ftr_inSample).rename('delta'),
        kdims=['px', 'r']
    )

fig_delta_n_req_per_ftr = (
    panels_delta_n_req_per_ftr['cca'].opts(sublabel_position=(-.45, .85))
    + panels_delta_n_req_per_ftr['pls'].opts(
        opts.Overlay(yaxis='bare', sublabel_position=(-.3, .85)),
        opts.QuadMesh(colorbar=True)
    )
).redim(
    px='Number of features',
    r=r'$r_\mathrm{true}$',
    delta='Req. num samples\n(bootstrap - in-sample)'
).opts(*fig_opts).opts(
    opts.QuadMesh(logx=True, clim=(0, 50), cmap='viridis')
)

fig_delta_n_req_per_ftr
/anaconda3/envs/gemmrtest/lib/python3.8/site-packages/xarray/core/nanops.py:142: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis=axis, dtype=dtype)
[8]:

Weight uncertainty

[9]:
def iqr(x):
    if np.isnan(x).any():
        return np.nan
    else:
        q = np.quantile(x, [.25, .75])
        iqr = q[1] - q[0]
        return iqr


def calc_iqr(da, dim):
    return xr.apply_ufunc(
        iqr,
        da,
        input_core_dims=[[dim]],
        vectorize=True
    )
[10]:
iqrs_inSample = {model: calc_iqr(res[model].x_weights, 'rep') for model in res}
iqrs_bs = {model: calc_iqr(res[model].x_weights_bs, 'bs').mean('rep') for model in res}
/anaconda3/envs/gemmrtest/lib/python3.8/site-packages/xarray/core/nanops.py:142: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis=axis, dtype=dtype)
[11]:
panels_weight_iqr = {model: hv.Overlay() for model in res}

for model in res:
    panels_weight_iqr[model] *= (
        hv.Scatter(
            (iqrs_inSample[model].values.ravel(), iqrs_bs[model].values.ravel())
        )
        * hv.Curve(([0, 1.5], [0, 1.5]))
    ).redim(
        x='IQR across rep. datasets',
        y='IQR across bootstrap iter'
    ).opts(
        opts.Curve(color='black', linewidth=1, linestyle='--'),
        opts.Scatter(s=5)
    )

fig_weight_iqr = (
    panels_weight_iqr['cca'].opts(sublabel_position=(-.75, .85))
    + panels_weight_iqr['pls'].opts(ylabel='', sublabel_position=(-.55, .85))
).opts(*fig_opts)

fig_weight_iqr
[11]:

Assemble figure

[12]:
fig = (
    fig_assoc_strength.Overlay.I.opts(
        sublabel_position=(-.55, .85), hooks=[suptitle_cca, Format_log_axis('x', major_numticks=3, minor_numticks=5)]
    )
    + fig_assoc_strength.Overlay.II.opts(
        sublabel_position=(-.55, .85), hooks=[suptitle_pls, Format_log_axis('x', major_numticks=4, minor_numticks=5)]
    )
    + fig_weight_error.Overlay.I.opts(
        sublabel_position=(-.55, .85), hooks=[Format_log_axis('x', major_numticks=3, minor_numticks=5)]
    )
    + fig_weight_error.Overlay.II.opts(
        sublabel_position=(-.55, .85), hooks=[Format_log_axis('x', major_numticks=4, minor_numticks=5)]
    )
    + fig_delta_n_req_per_ftr
    + fig_weight_iqr
).cols(
    2
).opts(*fig_opts).opts(
    opts.Layout(hspace=.35, vspace=.6)
)

hv.save(fig, 'fig/figS_bootstrap.pdf')

fig
[12]: