How many PCs are really necessary?
Setup
[1]:
# Notebook setup: plotting backend, project-wide figure config, warning filters.
import os
import xarray as xr
from gemmr.data.loaders import load_other_outcomes
import holoviews as hv
from holoviews import opts
hv.extension('matplotlib')  # render holoviews figures with matplotlib
hv.renderer('matplotlib').param.set_param(dpi=120)  # higher dpi for saved figures
from my_config import *  # project-wide settings (provides fig_opts used below)
import warnings
from matplotlib import MatplotlibDeprecationWarning
warnings.simplefilter('ignore', MatplotlibDeprecationWarning)
warnings.filterwarnings(
'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
) # holoviews emits this for log-linear plots
Prepare data
The following takes a while to run:
# Grid scan for CCA: for every combination of retained X- and Y-PC counts,
# record both the in-sample and the 5-fold cross-validated canonical
# correlation of the first mode, then persist the grids to a netCDF file.
import os
import numpy as np
import xarray as xr
from sklearn.model_selection import cross_val_score
from tqdm import trange
from gemmr.estimators import SVDCCA
from gemmr.estimators.helpers import pearson_transform_scorer

_ds = xr.open_dataset(
    os.path.expanduser('~/gemmr_data/empirical/ukb_fmri_smithpreproc_pcs.nc')
)
ukbb_fMRI_X, ukbb_fMRI_Y = _ds['X_pcs'].values, _ds['Y_pcs'].values

n_max = 100
# NaN-initialized result grids; row/column 0 remain NaN (>= 1 PC required)
cca_rs = np.full((n_max, n_max), np.nan)
cca_rs_cv = np.full((n_max, n_max), np.nan)

cca = SVDCCA()
for n_x in trange(1, n_max):
    for n_y in trange(1, n_max, leave=False):
        X_sub = ukbb_fMRI_X[:, :n_x]
        Y_sub = ukbb_fMRI_Y[:, :n_y]
        # in-sample canonical correlation of the first mode
        cca_rs[n_x, n_y] = cca.fit(X_sub, Y_sub).assocs_[0]
        # 5-fold cross-validated estimate of the same quantity
        cca_rs_cv[n_x, n_y] = cross_val_score(
            cca, X_sub, Y_sub, cv=5,
            scoring=pearson_transform_scorer, n_jobs=None
        ).mean()

# Store both grids with named PC-count dimensions for later reloading.
ds = xr.Dataset(dict(
    cca_rs=xr.DataArray(cca_rs, dims=('n_X_pcs', 'n_Y_pcs')),
    cca_rs_cv=xr.DataArray(cca_rs_cv, dims=('n_X_pcs', 'n_Y_pcs'))
))
ds.to_netcdf('ukb_how_many_pcs_cca_inSampleCV.nc')
# Grid scan for PLS: mirror of the CCA scan above, recording in-sample and
# 5-fold cross-validated covariance of the first PLS mode for every
# combination of retained X- and Y-PC counts; results go to a netCDF file.
import os
import numpy as np
import xarray as xr
from sklearn.model_selection import cross_val_score
from tqdm import trange
from gemmr.estimators import SVDPLS
from gemmr.estimators.helpers import cov_transform_scorer

_ds = xr.open_dataset(
    os.path.expanduser('~/gemmr_data/empirical/ukb_fmri_smithpreproc_pcs.nc')
)
ukbb_fMRI_X, ukbb_fMRI_Y = _ds['X_pcs'].values, _ds['Y_pcs'].values

n_max = 100
# NaN-initialized result grids; row/column 0 remain NaN (>= 1 PC required)
pls_covs = np.full((n_max, n_max), np.nan)
pls_covs_cv = np.full((n_max, n_max), np.nan)

pls = SVDPLS()
for n_x in trange(1, n_max):
    for n_y in trange(1, n_max, leave=False):
        X_sub = ukbb_fMRI_X[:, :n_x]
        Y_sub = ukbb_fMRI_Y[:, :n_y]
        # in-sample covariance of the first PLS mode
        pls_covs[n_x, n_y] = pls.fit(X_sub, Y_sub).assocs_[0]
        # 5-fold cross-validated estimate of the same quantity
        pls_covs_cv[n_x, n_y] = cross_val_score(
            pls, X_sub, Y_sub, cv=5,
            scoring=cov_transform_scorer, n_jobs=None
        ).mean()

# Store both grids with named PC-count dimensions for later reloading.
ds = xr.Dataset(dict(
    pls_covs=xr.DataArray(pls_covs, dims=('n_X_pcs', 'n_Y_pcs')),
    pls_covs_cv=xr.DataArray(pls_covs_cv, dims=('n_X_pcs', 'n_Y_pcs'))
))
ds.to_netcdf('ukb_how_many_pcs_pls_inSampleCV.nc')
The previous code segments store their outputs in files for later use. Now load the previously generated data:
[2]:
# Load the precomputed CCA and PLS result grids written by the cells above.
ds_cca = xr.load_dataset('ukb_how_many_pcs_cca_inSampleCV.nc')
ds_pls = xr.load_dataset('ukb_how_many_pcs_pls_inSampleCV.nc')
[3]:
# Pull the raw numpy arrays out of the datasets for plotting.
cca_rs = ds_cca['cca_rs'].values
cca_rs_cv = ds_cca['cca_rs_cv'].values
pls_covs = ds_pls['pls_covs'].values
pls_covs_cv = ds_pls['pls_covs_cv'].values
Results
[8]:
# 2x2 figure: association strength as a function of the number of retained
# X and Y principal components — CCA (top row) vs PLS (bottom row),
# in-sample (left column) vs 5-fold cross-validated (right column).
# Index 0 (zero retained PCs) holds only NaN, so it is dropped via [1:, 1:].
fig = (
hv.Raster(cca_rs[1:, 1:]).relabel('CCA').opts(clabel='Canonical correlation', xlabel='')
+ hv.Raster(cca_rs_cv[1:, 1:]).relabel('CCA (cross-validated)').opts(clabel='Canonical correlation', xlabel='', ylabel='')
# NOTE(review): 3500 looks like an arbitrary scaling to bring PLS covariances
# to order ~1, matching the 'a.u.' colorbar label — confirm its origin
+ hv.Raster(pls_covs[1:, 1:] / 3500).relabel('PLS').opts(clabel='PLS covariance / a.u.')
+ hv.Raster(pls_covs_cv[1:, 1:] / 3500).relabel('PLS (cross-validated)').opts(clabel='PLS covariance / a.u.', ylabel='')
).redim(
x='# top behav. PCs retained',
y='# top neuroim. PCs retained'
).cols(
2
).opts(*fig_opts).opts(
opts.Raster(colorbar=True, cmap='viridis', xticks=[0, 20, 40, 60, 80], invert_yaxis=True),
opts.Layout(hspace=.6, vspace=.45, sublabel_position=(-.28, .95))
)
# Save to PDF and display the figure inline.
hv.save(fig, "fig/figS_cca_pls_assocStrength_dependingOnNRetainedPCs.pdf")
fig
[8]: