Sample size dependence on powerlaw-decay constants

PLS outcomes depend on the within-set varianace spectrum. We have modeled the within-set variance spectrum as powerlaws, characterized by decay constants \(a_X\) and \(a_Y\) for datasets \(X\) and \(Y\), respectively. For simplicity, we have usually assumed a fixed value for the sum of the two decay constants: \(a_X+a_Y=-2\). Here, we show how required sample sizes depend on \(a_X + a_Y\).

Setup

[1]:
import numpy as np
import pandas as pd
import xarray as xr
import scipy.linalg
import scipy.stats
from scipy.stats import pearsonr, zscore
from scipy.spatial.distance import pdist, cdist, squareform

from sklearn import clone
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, ShuffleSplit

from gemmr.data import load_outcomes, print_ds_stats
from gemmr.metrics import *
from gemmr.sample_size.interpolation import *
from gemmr.plot import heatmap_n_req
from gemmr.util import nPerFtr2n, subset_ds

import matplotlib
import holoviews as hv
from holoviews import opts
hv.extension('matplotlib')
hv.renderer('matplotlib').param.set_param(dpi=120)

from my_config import *

import warnings
from matplotlib import MatplotlibDeprecationWarning
warnings.simplefilter('ignore', MatplotlibDeprecationWarning)
warnings.filterwarnings(
    'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
)  # holoviews emits this for log-linear plots
[2]:
data_home = None
ds_cca = load_outcomes('sweep_cca_cca_random_sum+-3+0_wOtherModel', data_home=data_home).sel(mode=0)
ds_pls = load_outcomes('sweep_pls_pls_random_sum+-3+0_wOtherModel', data_home=data_home).sel(mode=0)
Loading data from subfolder 'gemmr_latest'
Loading data from subfolder 'gemmr_latest'
[5]:
ds_cca = ds_cca.sel(px=ds_cca.px < 128)
ds_pls = ds_pls.sel(px=ds_pls.px < 128)
[6]:
ds_cca = subset_ds(ds_cca, n_keep=25)
ds_pls = subset_ds(ds_pls, n_keep=25)

What’s in the outcome data file?

[7]:
print_ds_stats(ds_cca)
n_rep            100
n_per_ftr        [   3    4    8   16   32   64  128  256  512 1024 2048 4096 8192]
r                [0.1 0.3 0.5 0.7]
px               [ 2  4  8 16 32 64]
ax+ay range     (-2.97, -0.10)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 6, r: 4)>
array([[25, 25, 25, 25],
       [25, 25, 25, 25],
       [25, 25, 25, 25],
       [25, 25, 25, 25],
       [25, 25, 25, 25],
       [ 0, 25, 25, 25]])
Coordinates:
  * r        (r) float64 0.1 0.3 0.5 0.7
  * px       (px) int64 2 4 8 16 32 64

power           calculated
[8]:
print_ds_stats(ds_pls)
n_rep            100
n_per_ftr        [   3    4    8   16   32   64  128  256  512 1024 2048 4096 8192]
r                [0.1 0.3 0.5 0.7]
px               [ 2  4  8 16 32 64]
ax+ay range     (-2.97, -0.10)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 6, r: 4)>
array([[25, 25, 25, 25],
       [25, 25, 25, 25],
       [25, 25, 25, 25],
       [ 0, 25, 25, 25],
       [ 0, 25, 25, 25],
       [ 0,  0,  0, 21]])
Coordinates:
  * r        (r) float64 0.1 0.3 0.5 0.7
  * px       (px) int64 2 4 8 16 32 64

power           calculated
[9]:
assert np.all(np.isnan(ds_pls.py) | (ds_pls.py == ds_pls.px))

Determine required sample sizes for different values of \(a_X + a_Y\)

[10]:
def plot_powerlaw_dependence(ds, tol=.5, qs=(.025, .975), sublabel_skip=0):

    axPlusy = ds.ax + ds.ay

    panels = {r: hv.Overlay() for r in ds.r.values}
    for target_axPlusy in [-2.5, -1.5, -.5]:
        ds_cond = ds.where(np.abs(axPlusy - (target_axPlusy)) < tol).dropna('Sigma_id', 'all')
        n_req = calc_n_required_all_metrics(ds_cond, search_dim='n_per_ftr')['combined']
        n_req = n_req.assign_coords(px=2*n_req.px).rename(px='ptot')  # assumes py == px
        print(n_req.count('Sigma_id').values)
        n_req_mean = n_req.mean('Sigma_id')
        for r in n_req.r.values:
            panels[r] *= hv.Curve(n_req_mean.sel(r=r))

    # --- assemble figure

    clrs = hv.Palette('Dark2', samples=8).values[:3]

    fig = (
        (
            panels[0.1]
            * hv.Text(75, 10*(1.75)**2, '-2.5', fontsize=7, halign='right', valign='bottom').opts(color=clrs[0])
            * hv.Text(75, 10*(1.75)**1, '-1.5', fontsize=7, halign='right', valign='bottom').opts(color=clrs[1])
            * hv.Text(75, 10*(1.75)**0, '-0.5', fontsize=7, halign='right', valign='bottom').opts(color=clrs[2])
            * hv.Text(100, 10*(1.75)**3, '$a_x+a_y$', fontsize=7, halign='right', valign='bottom')
        ).relabel('$r_\mathrm{true}=0.1$')
        + panels[0.3].opts(ylabel='', ).relabel('$r_\mathrm{true}=0.3$')
        + panels[0.5].opts(ylabel='', ).relabel('$r_\mathrm{true}=0.5$')
        + panels[0.7].opts(ylabel='', ).relabel('$r_\mathrm{true}=0.7$')
    ).redim(
        ptot='Number of features',
        n_per_ftr_required='Req. sample size per ftr'
    ).cols(
        4
    ).opts(*fig_opts).opts(
        opts.Curve(color=hv.Cycle(clrs)),
        opts.Overlay(logx=True, logy=True),
        opts.Layout(fig_inches=(7, None), sublabel_position=(-.35, .95), sublabel_format="{alpha}")
    )

    return fig
[11]:
fig_cca = plot_powerlaw_dependence(ds_cca, sublabel_skip=4)
fig_pls = plot_powerlaw_dependence(ds_pls, sublabel_skip=8)

fig = (
    fig_cca
    + fig_pls
).cols(
    4
).opts(*fig_opts).opts(
    opts.Layout(fig_inches=(7, None), sublabel_position=(-.35, .95), sublabel_format="{alpha}")
)

hv.save(fig, 'fig/figS_powerlaw_dependence.pdf')

fig
[[10 10 10 10]
 [10 10  9  9]
 [10 10 10 10]
 [ 7 10 10 10]
 [ 7  8  9 10]
 [ 0  6  6  8]]
[[8 8 8 8]
 [8 8 8 8]
 [8 8 8 8]
 [8 8 8 8]
 [8 8 8 8]
 [0 8 8 8]]
[[7 7 7 7]
 [6 7 7 7]
 [7 7 7 7]
 [7 7 7 7]
 [7 7 7 7]
 [0 7 7 7]]
[[10 10  9 10]
 [ 9  9  9  9]
 [10 10 10 10]
 [ 0 10 10  9]
 [ 0  6  7  4]
 [ 0  0  0  2]]
[[ 8  8  8  8]
 [ 8  8  7  8]
 [ 8  8  8  8]
 [ 0  8  8  8]
 [ 0  8  8 10]
 [ 0  0  0  7]]
[[ 7  7  7  7]
 [ 7  7  7  7]
 [ 7  7  7  7]
 [ 0  7  7  7]
 [ 0  7  8 11]
 [ 0  0  0 11]]
[11]: