Sample size dependence on within-set variance spectrum for PLS¶

PLS outcomes depend on the within-set varianace spectrum. We have modeled the within-set variance spectrum as powerlaws, characterized by decay constants \(a_X\) and \(a_Y\) for datasets \(X\) and \(Y\), respectively. For simplicity, we have usually assumed a fixed value for the sum of the two decay constants: \(a_X+a_Y=-2\). Here, we show how required sample sizes depend on \(a_X + a_Y\).

Setup¶

[1]:

import numpy as np
import pandas as pd
import xarray as xr
import scipy.linalg
import scipy.stats
from scipy.stats import pearsonr, zscore
from scipy.spatial.distance import pdist, cdist, squareform

from sklearn import clone
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, ShuffleSplit

from gemmr.data import load_outcomes, print_ds_stats
from gemmr.metrics import *
from gemmr.sample_size.interpolation import *
from gemmr.plot import heatmap_n_req
from gemmr.util import nPerFtr2n

import matplotlib
import holoviews as hv
from holoviews import opts
hv.extension('matplotlib')
hv.renderer('matplotlib').set_param(dpi=120)

from my_config import *

import warnings
from matplotlib import MatplotlibDeprecationWarning
warnings.simplefilter('ignore', MatplotlibDeprecationWarning)
warnings.filterwarnings(
    'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
)  # holoviews emits this for log-linear plots

[2]:

ds_pls = load_outcomes('pls').sel(mode=0)

What’s in the outcome data file?

[3]:

print_ds_stats(ds_pls)

n_rep            100
n_per_ftr        [   3    4    8   16   32   64  128  256  512 1024 2048 4096 8192]
r                [0.1 0.3 0.5 0.7 0.9]
px               [  2   4   8  16  32  64 128]
ax+ay range     (-2.97, -0.03)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 7, r: 5)>
array([[50, 75, 75, 75, 50],
       [50, 75, 75, 75, 50],
       [50, 75, 75, 75, 50],
       [ 0, 75, 75, 75, 50],
       [ 0, 75, 75, 75, 50],
       [ 0,  0, 25, 75, 50],
       [ 0,  0,  0, 50, 50]])
Coordinates:
  * r        (r) float64 0.1 0.3 0.5 0.7 0.9
  * px       (px) int64 2 4 8 16 32 64 128

power           calculated

Determine required sample sizes for different values of \(a_X + a_Y\)¶

[4]:

axPlusy = ds_pls.ax + ds_pls.ay

[5]:

tol = .5
qs = (.025, .975)
panels = {r: hv.Overlay() for r in ds_pls.r.values}
for target_axPlusy in [-2.5, -1.5, -.5]:#, -1.5, -2.5]:
    ds_pls_cond = ds_pls.where(np.abs(axPlusy - (target_axPlusy)) < tol).dropna('Sigma_id', 'all')
    n_req = calc_n_required_all_metrics(ds_pls_cond, search_dim='n_per_ftr')['combined']
    print(n_req.count('Sigma_id').values)
    n_req_mean = n_req.mean('Sigma_id')
    for r in n_req.r.values:
        panels[r] *= hv.Curve(n_req_mean.sel(r=r))

/anaconda3/envs/gemmrtest/lib/python3.8/site-packages/xarray/core/nanops.py:142: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis=axis, dtype=dtype)

[[19 30 30 30 19]
 [19 30 30 30 19]
 [19 30 30 30 19]
 [ 0 30 30 30 19]
 [ 0 30 30 30 19]
 [ 0  0 11 30 19]
 [ 0  0  0 19 17]]
[[15 22 22 22 15]
 [15 22 22 22 15]
 [15 22 22 22 15]
 [ 0 22 22 22 15]
 [ 0 22 22 22 15]
 [ 0  0  7 22 15]
 [ 0  0  0 15 15]]
[[16 23 23 23 16]
 [16 23 23 23 16]
 [16 23 23 23 16]
 [ 0 23 23 23 16]
 [ 0 23 23 23 16]
 [ 0  0  7 23 16]
 [ 0  0  0 16 16]]

[6]:

clrs = hv.Palette('Dark2', samples=8).values[:3]

fig = (
    (
        panels[0.1].relabel('$r_\mathrm{true}=0.1$')
        * hv.Text(75, 10*(1.75)**2, '-2.5', fontsize=7, halign='right', valign='bottom').opts(color=clrs[0])
        * hv.Text(75, 10*(1.75)**1, '-1.5', fontsize=7, halign='right', valign='bottom').opts(color=clrs[1])
        * hv.Text(75, 10*(1.75)**0, '-0.5', fontsize=7, halign='right', valign='bottom').opts(color=clrs[2])
        * hv.Text(100, 10*(1.75)**3, '$a_x+a_y$', fontsize=7, halign='right', valign='bottom')
    )
    + panels[0.3].opts(ylabel='').relabel('$r_\mathrm{true}=0.3$')
    + panels[0.5].opts(ylabel='').relabel('$r_\mathrm{true}=0.5$')
    + panels[0.7].opts(ylabel='').relabel('$r_\mathrm{true}=0.7$')
).redim(
    px='Number of features',
    n_per_ftr_required='Req. sample size per ftr'
).cols(
    4
).opts(*fig_opts).opts(
    opts.Curve(color=hv.Cycle(clrs)),
    opts.Overlay(logx=True, logy=True),
    opts.Layout(fig_inches=(7, None), sublabel_position=(-.35, .95))
)

hv.save(fig, 'fig/figS_required_sample_size_PLS_axPlusay.pdf')

fig

[6]: