Sample size dependence on within-set variance spectrum for PLS
PLS outcomes depend on the within-set variance spectrum. We have modeled the within-set variance spectra as power laws, characterized by decay constants \(a_X\) and \(a_Y\) for datasets \(X\) and \(Y\), respectively. For simplicity, we have usually assumed a fixed value for the sum of the two decay constants: \(a_X+a_Y=-2\). Here, we show how required sample sizes depend on \(a_X + a_Y\).
Setup
[1]:
import numpy as np
import pandas as pd
import xarray as xr
import scipy.linalg
import scipy.stats
from scipy.stats import pearsonr, zscore
from scipy.spatial.distance import pdist, cdist, squareform
from sklearn import clone
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, ShuffleSplit
from gemmr.data import load_outcomes, print_ds_stats
from gemmr.metrics import *
from gemmr.sample_size.interpolation import *
from gemmr.plot import heatmap_n_req
from gemmr.util import nPerFtr2n
import matplotlib
import holoviews as hv
from holoviews import opts
# Render holoviews objects with the matplotlib backend, at 120 dpi.
hv.extension('matplotlib')
hv.renderer('matplotlib').set_param(dpi=120)
from my_config import *
import warnings
from matplotlib import MatplotlibDeprecationWarning
# Globally suppress matplotlib deprecation warnings and the "aspect is not
# supported" UserWarning so that they do not clutter the cell outputs.
warnings.simplefilter('ignore', MatplotlibDeprecationWarning)
warnings.filterwarnings(
'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
) # holoviews emits this for log-linear plots
[2]:
# Load the stored PLS outcomes and keep only the entry with mode == 0
# (presumably the first PLS mode -- confirm against gemmr's outcome format).
ds_pls = load_outcomes('pls').sel({'mode': 0})
What’s in the outcome data file?
[3]:
# Print a summary of the loaded outcome dataset (see the cell output below).
print_ds_stats(ds_pls)
n_rep 100
n_per_ftr [ 3 4 8 16 32 64 128 256 512 1024 2048 4096 8192]
r [0.1 0.3 0.5 0.7 0.9]
px [ 2 4 8 16 32 64 128]
ax+ay range (-2.97, -0.03)
py == px
<xarray.DataArray 'n_Sigmas' (px: 7, r: 5)>
array([[50, 75, 75, 75, 50],
[50, 75, 75, 75, 50],
[50, 75, 75, 75, 50],
[ 0, 75, 75, 75, 50],
[ 0, 75, 75, 75, 50],
[ 0, 0, 25, 75, 50],
[ 0, 0, 0, 50, 50]])
Coordinates:
* r (r) float64 0.1 0.3 0.5 0.7 0.9
* px (px) int64 2 4 8 16 32 64 128
power calculated
Determine required sample sizes for different values of \(a_X + a_Y\)
[4]:
# Sum of the two within-set variance decay constants, a_X + a_Y.
axPlusy = ds_pls['ax'] + ds_pls['ay']
[5]:
# For each target value of a_X + a_Y, estimate the required sample size and add
# its mean (across Sigma_id) as one curve to the panel of every true correlation.

# Half-width of the window around each target. The targets below are spaced
# 1 apart and the comparison is strict, so the windows do not overlap.
tol = .5
qs = (.025, .975)  # NOTE(review): not referenced in the visible cells -- confirm before removing
panels = {r: hv.Overlay() for r in ds_pls.r.values}  # one overlay per value of r
for target_axPlusy in [-2.5, -1.5, -.5]:
    # Mask everything outside |a_X + a_Y - target| < tol, then drop the
    # Sigma_id entries that are left with no data at all.
    ds_pls_cond = ds_pls.where(
        np.abs(axPlusy - target_axPlusy) < tol
    ).dropna('Sigma_id', how='all')
    n_req = calc_n_required_all_metrics(ds_pls_cond, search_dim='n_per_ftr')['combined']
    # Number of non-missing estimates that enter each mean below.
    print(n_req.count('Sigma_id').values)
    with warnings.catch_warnings():
        # Some (px, r) combinations have no estimates (count of 0 printed
        # above); their mean is NaN by design, so numpy's warning about the
        # empty slice carries no information here.
        warnings.filterwarnings('ignore', 'Mean of empty slice', category=RuntimeWarning)
        n_req_mean = n_req.mean('Sigma_id')
    for r in n_req.r.values:
        panels[r] *= hv.Curve(n_req_mean.sel(r=r))
/anaconda3/envs/gemmrtest/lib/python3.8/site-packages/xarray/core/nanops.py:142: RuntimeWarning: Mean of empty slice
return np.nanmean(a, axis=axis, dtype=dtype)
[[19 30 30 30 19]
[19 30 30 30 19]
[19 30 30 30 19]
[ 0 30 30 30 19]
[ 0 30 30 30 19]
[ 0 0 11 30 19]
[ 0 0 0 19 17]]
[[15 22 22 22 15]
[15 22 22 22 15]
[15 22 22 22 15]
[ 0 22 22 22 15]
[ 0 22 22 22 15]
[ 0 0 7 22 15]
[ 0 0 0 15 15]]
[[16 23 23 23 16]
[16 23 23 23 16]
[16 23 23 23 16]
[ 0 23 23 23 16]
[ 0 23 23 23 16]
[ 0 0 7 23 16]
[ 0 0 0 16 16]]
[6]:
# Assemble the figure: one log-log panel per true correlation, showing the
# required sample size per feature against the number of features, with one
# curve per value of a_X + a_Y.

# First three 'Dark2' colors. Curves cycle through them in the order they were
# overlaid (targets -2.5, -1.5, -0.5), so the text labels below use the same order.
clrs = hv.Palette('Dark2', samples=8).values[:3]
text_kw = dict(fontsize=7, halign='right', valign='bottom')
fig = (
    (
        # Raw strings keep the LaTeX backslash literal without relying on
        # Python passing the unrecognized escape '\m' through unchanged.
        panels[0.1].relabel(r'$r_\mathrm{true}=0.1$')
        # Hand-placed legend, drawn in the first panel only; the factor 1.75
        # spaces the entries evenly on the logarithmic y axis.
        * hv.Text(75, 10*(1.75)**2, '-2.5', **text_kw).opts(color=clrs[0])
        * hv.Text(75, 10*(1.75)**1, '-1.5', **text_kw).opts(color=clrs[1])
        * hv.Text(75, 10*(1.75)**0, '-0.5', **text_kw).opts(color=clrs[2])
        * hv.Text(100, 10*(1.75)**3, '$a_x+a_y$', **text_kw)
    )
    # Only the first panel keeps its y-axis label.
    + panels[0.3].opts(ylabel='').relabel(r'$r_\mathrm{true}=0.3$')
    + panels[0.5].opts(ylabel='').relabel(r'$r_\mathrm{true}=0.5$')
    + panels[0.7].opts(ylabel='').relabel(r'$r_\mathrm{true}=0.7$')
).redim(
    px='Number of features',
    n_per_ftr_required='Req. sample size per ftr'
).cols(
    4
).opts(*fig_opts).opts(  # fig_opts is not defined in this notebook; presumably from a star import
    opts.Curve(color=hv.Cycle(clrs)),
    opts.Overlay(logx=True, logy=True),
    opts.Layout(fig_inches=(7, None), sublabel_position=(-.35, .95))
)
hv.save(fig, 'fig/figS_required_sample_size_PLS_axPlusay.pdf')
fig
[6]: