How many PCs are really necessary?

Setup

[1]:
import os

import xarray as xr

# Project-specific loader (gemmr: generative modelling for CCA/PLS analyses).
from gemmr.data.loaders import load_other_outcomes

import holoviews as hv
from holoviews import opts
# Register the matplotlib backend first; the renderer is configured right after.
hv.extension('matplotlib')
# Render figures at 120 dpi. NOTE(review): `set_param` is the pre-2.0 param API;
# newer param versions prefer `.param.update(dpi=120)` — confirm pinned version.
hv.renderer('matplotlib').param.set_param(dpi=120)

# Shared notebook configuration; provides e.g. `fig_opts` used in the Results
# section. NOTE(review): wildcard import hides exactly which names come from
# my_config — TODO confirm against that module.
from my_config import *

import warnings
from matplotlib import MatplotlibDeprecationWarning
# Silence matplotlib deprecation chatter triggered via holoviews' mpl backend.
warnings.simplefilter('ignore', MatplotlibDeprecationWarning)
warnings.filterwarnings(
    'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
)  # holoviews emits this for log-linear plots

Prepare data

The following takes a while to run:

import os

import numpy as np
import xarray as xr
from sklearn.model_selection import cross_val_score
from tqdm import trange

from gemmr.estimators import SVDCCA
from gemmr.estimators.helpers import pearson_transform_scorer

# Maximum number of principal components considered per data block.
# Grid index i (j) equals the number of X (Y) PCs retained, so row/column 0
# corresponds to "0 PCs", is never computed, and stays NaN.
MAX_PCS = 100

_ds = xr.open_dataset(
    os.path.expanduser('~/gemmr_data/empirical/ukb_fmri_smithpreproc_pcs.nc')
)
ukbb_fMRI_X, ukbb_fMRI_Y = _ds['X_pcs'].values, _ds['Y_pcs'].values

# In-sample and 5-fold cross-validated first canonical correlation for every
# combination of (# X PCs retained, # Y PCs retained).
cca_rs = np.full((MAX_PCS, MAX_PCS), np.nan)
cca_rs_cv = np.full((MAX_PCS, MAX_PCS), np.nan)
cca = SVDCCA()
for i in trange(1, MAX_PCS):
    for j in trange(1, MAX_PCS, leave=False):
        # In-sample: fit on all data, take the first canonical correlation.
        cca_rs[i, j] = cca.fit(ukbb_fMRI_X[:, :i], ukbb_fMRI_Y[:, :j]).assocs_[0]
        # Out-of-sample: mean score across 5 cross-validation folds.
        cca_rs_cv[i, j] = cross_val_score(
            cca, ukbb_fMRI_X[:, :i], ukbb_fMRI_Y[:, :j], cv=5,
            scoring=pearson_transform_scorer, n_jobs=None
        ).mean()

# Persist the grids so the (slow) computation only has to run once; the
# Results section reloads this file.
ds = xr.Dataset(dict(
    cca_rs=xr.DataArray(cca_rs, dims=('n_X_pcs', 'n_Y_pcs')),
    cca_rs_cv=xr.DataArray(cca_rs_cv, dims=('n_X_pcs', 'n_Y_pcs'))
))
ds.to_netcdf('ukb_how_many_pcs_cca_inSampleCV.nc')
import os

import numpy as np
import xarray as xr
from sklearn.model_selection import cross_val_score
from tqdm import trange

from gemmr.estimators import SVDPLS
from gemmr.estimators.helpers import cov_transform_scorer

# Maximum number of principal components considered per data block.
# Grid index i (j) equals the number of X (Y) PCs retained, so row/column 0
# corresponds to "0 PCs", is never computed, and stays NaN.
MAX_PCS = 100

_ds = xr.open_dataset(
    os.path.expanduser('~/gemmr_data/empirical/ukb_fmri_smithpreproc_pcs.nc')
)
ukbb_fMRI_X, ukbb_fMRI_Y = _ds['X_pcs'].values, _ds['Y_pcs'].values

# In-sample and 5-fold cross-validated first PLS covariance for every
# combination of (# X PCs retained, # Y PCs retained).
pls_covs = np.full((MAX_PCS, MAX_PCS), np.nan)
pls_covs_cv = np.full((MAX_PCS, MAX_PCS), np.nan)
pls = SVDPLS()
for i in trange(1, MAX_PCS):
    for j in trange(1, MAX_PCS, leave=False):
        # In-sample: fit on all data, take the first singular-value covariance.
        pls_covs[i, j] = pls.fit(ukbb_fMRI_X[:, :i], ukbb_fMRI_Y[:, :j]).assocs_[0]
        # Out-of-sample: mean score across 5 cross-validation folds.
        pls_covs_cv[i, j] = cross_val_score(
            pls, ukbb_fMRI_X[:, :i], ukbb_fMRI_Y[:, :j], cv=5,
            scoring=cov_transform_scorer, n_jobs=None
        ).mean()

# Persist the grids so the (slow) computation only has to run once; the
# Results section reloads this file.
ds = xr.Dataset(dict(
    pls_covs=xr.DataArray(pls_covs, dims=('n_X_pcs', 'n_Y_pcs')),
    pls_covs_cv=xr.DataArray(pls_covs_cv, dims=('n_X_pcs', 'n_Y_pcs'))
))
ds.to_netcdf('ukb_how_many_pcs_pls_inSampleCV.nc')

The previous code segments store their outputs in files so the slow computation only needs to run once. Now load the previously generated data:

[2]:
# Load the precomputed CCA/PLS association-strength grids produced by the
# "Prepare data" scripts above.
ds_cca = xr.load_dataset('ukb_how_many_pcs_cca_inSampleCV.nc')
ds_pls = xr.load_dataset('ukb_how_many_pcs_pls_inSampleCV.nc')
[3]:
# Unpack to plain numpy arrays; index [i, j] corresponds to i X-PCs and
# j Y-PCs retained (row/column 0 is NaN — "0 PCs" was never computed).
cca_rs, cca_rs_cv = ds_cca.cca_rs.values, ds_cca.cca_rs_cv.values
pls_covs, pls_covs_cv = ds_pls.pls_covs.values, ds_pls.pls_covs_cv.values

Results

[8]:
# PLS covariances are reported in arbitrary units; divide by this factor so
# the colorbar shows readable numbers. NOTE(review): 3500 looks like an
# ad-hoc scale chosen for display — confirm there is no principled value.
PLS_COV_SCALE = 3500

# 2x2 panel: in-sample vs cross-validated association strength for CCA (top)
# and PLS (bottom). The [1:, 1:] slices drop the never-computed "0 PCs"
# row/column (all NaN).
fig = (
    hv.Raster(cca_rs[1:, 1:]).relabel('CCA').opts(clabel='Canonical correlation', xlabel='')
    + hv.Raster(cca_rs_cv[1:, 1:]).relabel('CCA (cross-validated)').opts(clabel='Canonical correlation', xlabel='', ylabel='')
    + hv.Raster(pls_covs[1:, 1:] / PLS_COV_SCALE).relabel('PLS').opts(clabel='PLS covariance / a.u.')
    + hv.Raster(pls_covs_cv[1:, 1:] / PLS_COV_SCALE).relabel('PLS (cross-validated)').opts(clabel='PLS covariance / a.u.', ylabel='')
).redim(
    x='# top behav. PCs retained',
    y='# top neuroim. PCs retained'
).cols(
    2
).opts(*fig_opts).opts(
    opts.Raster(colorbar=True, cmap='viridis', xticks=[0, 20, 40, 60, 80], invert_yaxis=True),
    opts.Layout(hspace=.6, vspace=.45, sublabel_position=(-.28, .95))
)

# Ensure the output directory exists; hv.save would otherwise raise.
os.makedirs('fig', exist_ok=True)
hv.save(fig, "fig/figS_cca_pls_assocStrength_dependingOnNRetainedPCs.pdf")

fig
[8]: