PC bias¶
We show that PLS weights resemble the 1st principal component axis.
Setup¶
[1]:
import numpy as np
import xarray as xr
import pandas as pd
import scipy.stats
from gemmr.data import load_outcomes, print_ds_stats
from gemmr.sample_analysis import *
from gemmr.plot import *
import matplotlib
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.patheffects as path_effects
import holoviews as hv
from holoviews import opts
hv.extension('matplotlib')
hv.renderer('matplotlib').set_param(dpi=120)
from my_config import *
import warnings
warnings.filterwarnings(
'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
) # holoviews emits this for log-linear plots
/Users/mdh56/Projects/gemmr/gemmr/sample_analysis/analyzers.py:13: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
from tqdm.autonotebook import tqdm
[2]:
r_clrs = hv.Palette(cmap_r, samples=3).values
n_per_ftr_clrs = hv.Palette(cmap_n_per_ftr, samples=4).values
clr_random = 'darkslateblue'
clr_perm = 'gray'
2 features¶
First, we look into a toy example where both datasets have only 2 features. In this case, weight vectors are two-dimensional unit vectors and can therefore be illustrated as points on a circle.
[ ]:
n_rep = 10000
ds_cca_2 = analyze_model_parameters(
'cca',
pxs=[2],
py='px',
n_per_ftrs=[n_per_ftr_typical, 10*n_per_ftr_typical],
rs=[.3],
axPlusay_range=(0, 0),
n_Sigmas=1,
n_rep=n_rep,
)
ds_pls_2 = analyze_model_parameters(
'pls',
pxs=[2],
py='px',
n_per_ftrs=[.5, n_per_ftr_typical, 10*n_per_ftr_typical],
rs=[.3],
axPlusay_range=(-2, -2),
n_Sigmas=1,
n_rep=n_rep,
)
dss_2 = dict(cca=ds_cca_2, pls=ds_pls_2)
[4]:
def calc_angles(da):
return xr.apply_ufunc(
lambda x: np.arctan(x[1] / x[0]) + np.pi/2,
da,
input_core_dims=[['x_feature']],
vectorize=True
)
def directional_mean(angles):
R = np.exp(2j*angles).mean()
mn = np.angle(R) / 2#np.arctan(R.imag / R.real)# / 2
if mn < 0:
mn = np.pi + mn
return mn
[5]:
thetas = np.linspace(0, np.pi, 18)
panels_weights_2 = dict(cca=hv.Overlay(), pls=hv.Overlay())
def samples_per_feature_legend(plot, element):
ax = plot.handles['axis']
kwargs = dict(ha='center', fontsize=8, transform=ax.transAxes)
ax.text(1.35, .2, 'samples\nper feature', color='black', **kwargs)
ax.text(1.35, .45, '50', color=n_per_ftr_clrs[2], **kwargs)
ax.text(1.35, .575, '5', color=n_per_ftr_clrs[1], **kwargs)
ax.text(1.35, .7, '0.5', color=n_per_ftr_clrs[0],
path_effects=[path_effects.Stroke(linewidth=.1, foreground='darkgrey'), path_effects.Normal()], **kwargs)
def clip_off(plot, element):
artist = plot.handles['artist']
artist.set_clip_on(False)
def cca_labels(plot, element):
ax = plot.handles['axis']
mx_r = 4250
for model in ['cca', 'pls']:
_ds = dss_2[model].sel(mode=0, Sigma_id=0, px=2, r=0.3)
angle_true = float(calc_angles(_ds.x_weights_true))
angles_estim = calc_angles(_ds.x_weights)
panels_weights_2[model] *= hv.VLine(angle_true).opts(color='black')
panels_weights_2[model] *= hv.Text(angle_true, 1.25*mx_r, 'true', rotation=180*angle_true/np.pi-90, halign='center', valign='center', fontsize=8)
for n_per_ftr in _ds.n_per_ftr[::-1].values:
angles_ = angles_estim.sel(n_per_ftr=n_per_ftr)
panels_weights_2[model] *= polar_hist(angles_).relabel('n/ftr=%s' % str(n_per_ftr))
mn_angle = directional_mean(angles_.values)
if model == 'cca':
delta_r = n_per_ftr*10
else:
delta_r = 0
panels_weights_2[model] *= hv.Scatter(([mn_angle], [1*mx_r + delta_r])).opts(hooks=[clip_off])
panels_weights_2[model] *= hv.Text(np.pi+.25, 3000, 'PC2', fontsize=8)
panels_weights_2[model] *= hv.VLine(np.pi/2).opts(color='black', linestyle='--', linewidth=1)
panels_weights_2[model] *= hv.Text(np.pi/2, 1.15*mx_r, 'PC1', fontsize=8, halign='center', valign='center')
panels_weights_2[model] *= hv.Text(np.pi/2, 1.35*mx_r, '$r_\mathrm{true}=0.3$, 2 ftrs/set', halign='center', valign='bottom', fontsize=8)
fig_weights_2 = (
panels_weights_2['cca'].opts(hooks=[samples_per_feature_legend, suptitle_cca])
+ panels_weights_2['pls'].opts(hooks=[suptitle_pls])
).redim(
x='r_hist',
y='angle_hist'
).opts(*fig_opts).opts(
#opts.Histogram(projection='polar', facecolor=(0, 0, 0, 0), edgecolor=hv.Palette('cool')),
opts.Curve(color=hv.Cycle(n_per_ftr_clrs[:-1][::-1])),
opts.Scatter(s=15, color=hv.Cycle(n_per_ftr_clrs[:-1][::-1])),
opts.Overlay(sublabel_position=(-.35, .75), xaxis='bare', yaxis='bare', ylim=(0, mx_r), xlim=(0, np.pi), show_legend=False),
#opts.Layout(fig_inches=(3.42, 1))
)
fig_weights_2
[5]:
Load data for following analyses¶
[6]:
res = dict(
cca=load_outcomes('cca', tag='pcBias'),
pls=load_outcomes('pls', tag='pcBias_axPlusay-2')
)
What’s in the outcome data files?
[7]:
print_ds_stats(res['cca'])
n_modes 1
n_rep 100
n_perm 10
n_per_ftr [200 128 64 32 16 8 5 3]
r [0.1 0.3 0.5]
px [64 32 16 8 4 2]
ax+ay range (0.00, 0.00)
py == px
<xarray.DataArray 'n_Sigmas' (px: 6, r: 3)>
array([[10, 10, 10],
[10, 10, 10],
[10, 10, 10],
[10, 10, 10],
[10, 10, 10],
[10, 10, 10]])
Coordinates:
* r (r) float64 0.1 0.3 0.5
* px (px) int64 64 32 16 8 4 2
power not calculated
[8]:
print_ds_stats(res['pls'])
n_modes 1
n_rep 100
n_perm 10
n_per_ftr [200 128 64 32 16 8 5 3]
r [0.1 0.3 0.5]
px [64 32 16 8 4 2]
ax+ay range (-2.00, -2.00)
py == px
<xarray.DataArray 'n_Sigmas' (px: 6, r: 3)>
array([[10, 10, 10],
[10, 10, 10],
[10, 10, 10],
[10, 10, 10],
[10, 10, 10],
[10, 10, 10]])
Coordinates:
* r (r) float64 0.1 0.3 0.5
* px (px) int64 64 32 16 8 4 2
power not calculated
How do weight vectors look like in the principal component coordinate system?¶
[11]:
def plot_weight_pc_cossim(ds, n_pcs=None, color=None):
if n_pcs is None:
n_pcs = ds.px.values.max()
sim = np.abs(ds.x_weights_pc_cossim.sel(px=n_pcs))
sim_perm = np.abs(ds.x_weights_pc_cossim_perm.sel(px=n_pcs))
other_dims = [d for d in sim.dims if d != 'x_feature']
sim = sim.stack(it=other_dims).mean('it')
sim_perm = sim_perm.stack(it=other_dims + ['perm']).mean('it')
random_data = np.abs(scipy.stats.beta.rvs((n_pcs-1)/2, (n_pcs-1)/2, size=(10000)) * 2 - 1)
panel = (
hv.HLine(random_data.mean())
* hv.Curve(
(np.arange(1, len(sim_perm)+1), sim_perm.values),
label=r'$\langle$null$\rangle$').opts(linestyle='--')
* hv.Curve(
(np.arange(1, len(sim)+1), sim.values),
label=r'$\langle$synthetic' + '\n' + r'datasets$\rangle$')
).opts(
opts.Curve(color=color, xlim=(1, n_pcs+1)),
opts.HLine(color=clr_random, linestyle='-', linewidth=1),
).redim(
x='Principal component',
y='Weight PC bias'
)
return panel
[12]:
def hook_weight_pc_cossim_legend(plot, element):
ax = plot.handles['axis']
patch_rnd = mpatches.Patch(color=clr_random, label='random')
patch_2 = mpatches.Patch(color=n_per_ftr_clrs[1], label='{} samples/ftr'.format(n_per_ftr_typical))
patch_50 = mpatches.Patch(color=n_per_ftr_clrs[-1], label='200 samples/ftr')
line_synt = Line2D([0], [0], color='black', linewidth=1, linestyle='-', label=r'$\langle$synth$\rangle$')
line_null = Line2D([0], [0], color='black', linewidth=1, linestyle='--', label=r'$\langle$perm$\rangle$')
ax.legend(handles=[patch_rnd, patch_2, patch_50, line_synt, line_null], frameon=False, fontsize=7,
handlelength=1, ncol=2, columnspacing=.5,
loc='upper right', bbox_to_anchor=(1.475, 1.1))
panels_cca_weight_pc_cossim = (
plot_weight_pc_cossim(res['cca'].sel(r=0.3, n_per_ftr=200), color=n_per_ftr_clrs[-1])
* plot_weight_pc_cossim(res['cca'].sel(r=0.3, n_per_ftr=n_per_ftr_typical), color=n_per_ftr_clrs[1])
).relabel(
r'$r_\mathrm{true}=0.3$, 64 ftrs/set'
).opts(
hooks=[hook_weight_pc_cossim_legend], logx=True, logy=True, show_legend=True
)
panels_pls_weight_pc_cossim = (
plot_weight_pc_cossim(res['pls'].sel(r=0.3, n_per_ftr=200), color=n_per_ftr_clrs[-1])
* plot_weight_pc_cossim(res['pls'].sel(r=0.3, n_per_ftr=n_per_ftr_typical), color=n_per_ftr_clrs[1])
#* hv.Text(64, 0.5, r'$r_\mathrm{true}=0.3$', fontsize=7, halign='right', valign='top')
#* hv.Text(64, 0.38, '64 ftrs/set', fontsize=7, halign='right', valign='top')
).relabel(
r'$r_\mathrm{true}=0.3$, 64 ftrs/set'
).opts(
show_legend=False, logx=True, logy=True, ylabel=''
)
(
panels_cca_weight_pc_cossim
+ panels_pls_weight_pc_cossim
).opts(*fig_opts)
[12]:
How strong is the PC bias depending on sample size and true between-set correlation?¶
We define “PC bias” as the mean cosine-similarity between estimated weights and the 1st principal component axis. The mean is taken across repetitions, dimensionalities and different instantiations of the joint covariance matrix.
[14]:
panels_nPerFtrs_cossim = {model: hv.Overlay() for model in res}
for model in res:
mean_beta_rvs = xr.DataArray(pd.Series(
{px: np.abs(scipy.stats.beta.rvs((px-1)/2, (px-1)/2, size=(10000)) * 2 - 1).mean()
for px in res[model].px.values},
)).rename(dim_0='px')
delta_pc1bias = np.maximum(
(np.abs(res[model].x_weights_pc_cossim.sel(x_feature=0)) - mean_beta_rvs).mean('rep').mean('Sigma_id').mean('px'),
(np.abs(res[model].y_weights_pc_cossim.sel(y_feature=0)) - mean_beta_rvs).mean('rep').mean('Sigma_id').mean('px')
)
delta_pc1bias_perm = np.maximum(
(np.abs(res[model].x_weights_pc_cossim_perm.sel(x_feature=0)) - mean_beta_rvs).mean('rep').mean('Sigma_id').mean('perm').mean('px'),
(np.abs(res[model].y_weights_pc_cossim_perm.sel(y_feature=0)) - mean_beta_rvs).mean('rep').mean('Sigma_id').mean('perm').mean('px')
)
for r in delta_pc1bias.r.values:
panels_nPerFtrs_cossim[model] *= hv.Curve(delta_pc1bias.sel(r=r), label=r'$r_\mathrm{true}=%.1f$' % r)
for r in delta_pc1bias_perm.r.values:
panels_nPerFtrs_cossim[model] *= hv.Curve(delta_pc1bias_perm.sel(r=r), label=r'$r_\mathrm{true}=%.1f$' % r).opts(linestyle='--')
fig_nPerFtrs_cossim = (
(
panels_nPerFtrs_cossim['cca']
* hv.Text(50, .4, r'$r_\mathrm{true}=$', fontsize=7, halign='center')
* hv.Text(50, .325, '0.1', fontsize=7, halign='center').opts(color=r_clrs[0])
* hv.Text(50, .25, '0.3', fontsize=7, halign='center').opts(color=r_clrs[1])
* hv.Text(50, .175, '0.5', fontsize=7, halign='center').opts(color=r_clrs[2])
).opts(show_legend=False)
+ panels_nPerFtrs_cossim['pls'].opts(show_legend=False, ylabel='')
).redim(
n_per_ftr='Samples per feature',
y=hv.Dimension('cossim_weights_1stPCaxis', label='Avg. weight PC1 bias\n(data - random)')
).opts(*fig_opts).opts(
opts.Curve(color=hv.Cycle(r_clrs)),
opts.Overlay(logx=True)
)
fig_nPerFtrs_cossim
[14]:
Figure¶
[17]:
def axpos_middlerow(plot, element):
ax = plot.handles['axis']
bbox = ax.get_position()
ax.set_position((bbox.x0, bbox.y0+.025, bbox.width, bbox.height))
def axpos_bottomrow(plot, element):
ax = plot.handles['axis']
bbox = ax.get_position()
ax.set_position((bbox.x0+.025, .05, .85*bbox.width, .9*bbox.height))
def synth_perm_legend(plot, element):
ax = plot.handles['axis']
line_synt = Line2D([0], [0], color='black', linewidth=1, linestyle='-', label=r'synth')
line_null = Line2D([0], [0], color='black', linewidth=1, linestyle='--', label=r'perm')
ax.legend(handles=[line_synt, line_null], frameon=False, fontsize=7,
handlelength=1, ncol=1,
loc='upper right', bbox_to_anchor=(1.475, 1.03))
fig = (
fig_weights_2
+ panels_cca_weight_pc_cossim.opts(hooks=[axpos_middlerow, hook_weight_pc_cossim_legend])
+ panels_pls_weight_pc_cossim.opts(hooks=[axpos_middlerow])
+ fig_nPerFtrs_cossim.Overlay.I.opts(opts.Overlay(hooks=[axpos_bottomrow, synth_perm_legend, legend_frame_off]))
+ fig_nPerFtrs_cossim.Overlay.II.opts(opts.Overlay(hooks=[axpos_bottomrow]))
).cols(
2
).opts(*fig_opts).opts(
opts.Curve(linewidth=1.5),
opts.Layout(sublabel_position=(-.35, .95), vspace=.4),
)
hv.save(fig, 'fig/fig4_pc_bias.pdf')
fig
[17]: