Cross-loadings¶

We explore how many samples are necessary to obtain stable cross-loadings, and compare to loadings.

Setup¶

[1]:

import numpy as np
import xarray as xr
from scipy.stats import pearsonr, spearmanr
from scipy.spatial.distance import pdist

from gemmr.data import load_outcomes, print_ds_stats
from gemmr.generative_model import *
from gemmr.sample_analysis import *
from gemmr.estimators import SVDCCA
from gemmr.sample_size import *
from gemmr.metrics import *

import holoviews as hv
from holoviews import opts
hv.renderer('matplotlib').set_param(dpi=120)
hv.extension('matplotlib')

from my_config import *

import warnings
warnings.simplefilter('ignore', UserWarning)
warnings.filterwarnings(
    'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
)  # holoviews emits this for log-linear plots

/Users/mdh56/Projects/gemmr/gemmr/sample_analysis/analyzers.py:13: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
  from tqdm.autonotebook import tqdm

[2]:

ds_cca = load_outcomes('cca')
ds_pls = load_outcomes('pls', tag='axPlusay-2')

What’s in the outcome data?

[3]:

print_ds_stats(ds_cca)

n_modes          1
n_rep            100
n_per_ftr        [   3    4    8   16   32   64  128  256  512 1024]
r                [0.1 0.3 0.5 0.7 0.9]
px               [  2   4   8  16  32  64 128]
ax+ay range     (0.00, 0.00)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 7, r: 5)>
array([[25, 25, 25, 25, 25],
       [25, 25, 25, 25, 25],
       [25, 25, 25, 25, 25],
       [25, 25, 25, 25, 25],
       [25, 25, 25, 25, 25],
       [ 0, 25, 25, 25, 25],
       [ 0, 25, 25, 25, 25]])
Coordinates:
  * r        (r) float64 0.1 0.3 0.5 0.7 0.9
  * px       (px) int64 2 4 8 16 32 64 128

power           calculated

[4]:

print_ds_stats(ds_pls)

n_modes          1
n_rep            100
n_per_ftr        [   3    4    8   16   32   64  128  256  512 1024 2048 4096 8192]
r                [0.1 0.3 0.5 0.7 0.9]
px               [  2   4   8  16  32  64 128]
ax+ay range     (-2.00, -2.00)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 7, r: 5)>
array([[25, 25, 25, 25, 25],
       [25, 25, 25, 25, 25],
       [25, 25, 25, 25, 25],
       [ 0, 25, 25, 25, 25],
       [ 0, 25, 25, 25, 25],
       [ 0,  0,  0, 25, 25],
       [ 0,  0,  0, 25, 25]])
Coordinates:
  * r        (r) float64 0.1 0.3 0.5 0.7 0.9
  * px       (px) int64 2 4 8 16 32 64 128

power           calculated

True loadings vs true cross-loadings¶

How similar are loadings and cross-loadings in principle? We compare here their true values that were calculated directly from the assumed joint covariance matrices (i.e. they were not estimated from data).

[5]:

def _corr(x, y, corr_fun=pearsonr):
    mask = np.isfinite(x) & np.isfinite(y)
    if mask.sum() >= 2:
        return corr_fun(x[mask], y[mask])[0]
    else:
        return np.nan


def corrs_loading_crossloadings_true(ds, corr_fun=pearsonr):
    return xr.apply_ufunc(
        _corr,
        ds.x_loadings_true,
        ds.x_crossloadings_true,
        input_core_dims=[['x_feature'], ['x_feature']],
        vectorize=True,
        kwargs=dict(corr_fun=corr_fun)
    )

[6]:

cca_pearsonr_loadings_crossloadings_true = corrs_loading_crossloadings_true(ds_cca, pearsonr).mean('Sigma_id').rename('Pearson correlation between\ntrue loadings and\ntrue cross-loadings')
cca_spearmanr_loadings_crossloadings_true = corrs_loading_crossloadings_true(ds_cca, spearmanr).mean('Sigma_id').rename('Spearman correlation between\ntrue loadings and\ntrue cross-loadings')

pls_pearsonr_loadings_crossloadings_true = corrs_loading_crossloadings_true(ds_pls, pearsonr).mean('Sigma_id').rename('Pearson correlation between\ntrue loadings and\ntrue cross-loadings')
pls_spearmanr_loadings_crossloadings_true = corrs_loading_crossloadings_true(ds_pls, spearmanr).mean('Sigma_id').rename('Spearman correlation between\ntrue loadings and\ntrue cross-loadings')

/anaconda3/envs/gemmrtest/lib/python3.8/site-packages/xarray/core/nanops.py:142: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis=axis, dtype=dtype)

[7]:

fig_true = (
    hv.QuadMesh(cca_pearsonr_loadings_crossloadings_true, kdims=['px', 'r']).opts(hooks=[suptitle_cca])
    + hv.QuadMesh(pls_pearsonr_loadings_crossloadings_true, kdims=['px', 'r']).opts(colorbar=True, ylabel='', hooks=[suptitle_pls])
    #+ hv.QuadMesh(cca_spearmanr_loadings_crossloadings_true, kdims=['px', 'r'])
    #+ hv.QuadMesh(pls_spearmanr_loadings_crossloadings_true, kdims=['px', 'r']).opts(colorbar=True)
).redim(
    px='Number of features',
    r='$r_\mathrm{true}$'
).cols(
    2
).opts(*fig_opts).opts(
    opts.QuadMesh(logx=True, cmap='inferno', clim=(0, 1)),
    opts.Layout(hspace=.5, sublabel_position=(-.45, .95), fig_inches=(3.42, 1.))
)

hv.save(fig_true, 'fig/figS_true_loadings_vs_true_crossloadings.pdf')

fig_true

[7]:

How many samples are required to obtain stable cross-loadings, compared to the number of required samples for loadings?¶

[8]:

target_error = 0.1

n_reqs_cca = calc_n_required_all_metrics(ds_cca.sel(mode=0), target_error=target_error, search_dim='n_per_ftr')
n_reqs_pls = calc_n_required_all_metrics(ds_pls.sel(mode=0), target_error=target_error, search_dim='n_per_ftr')

n_reqs_cca['crossloadingError'] = calc_n_required(
    mk_crossloadingError(ds_cca).mean('rep'),
    -target_error, target_error, search_dim='n_per_ftr')
n_reqs_pls['crossloadingError'] = calc_n_required(
    mk_crossloadingError(ds_pls).mean('rep'),
    -target_error, target_error, search_dim='n_per_ftr')

/anaconda3/envs/gemmrtest/lib/python3.8/site-packages/xarray/core/nanops.py:142: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis=axis, dtype=dtype)

[9]:

pxs_cca = n_reqs_cca['combined'].px
pxs_pls = n_reqs_pls['combined'].px

rel_diff = lambda n_reqs, ref_metric: ((n_reqs['crossloadingError'] - n_reqs[ref_metric]) / n_reqs[ref_metric]).mean('Sigma_id').sel(px=n_reqs[ref_metric].px>2).rename('Required samples per feature\n(cross-loadings - loadings)/loadings')

fig = (
    hv.QuadMesh(rel_diff(n_reqs_cca, 'loadingError'), kdims=['px', 'r']).opts(hooks=[suptitle_cca])
    + hv.QuadMesh(rel_diff(n_reqs_pls, 'loadingError'), kdims=['px', 'r']).opts(
        ylabel='', colorbar=True, hooks=[suptitle_pls]
    )
).redim(
    px='Number of features',
    r='$r_\mathrm{true}$'
).cols(
    2
).opts(*fig_opts).opts(
    opts.QuadMesh(logx=True, logz=False, cmap='RdBu_r', symmetric=True, clim=(-.65, .65)),
    opts.Layout(hspace=.5, vspace=.5, sublabel_position=(-.4, .95), fig_inches=(3.42, 1.)),
)

hv.save(fig, 'fig/fig7_crossloadings.pdf')

fig

[9]: