PC bias¶

We show that PLS weights resemble the 1st principal component axis.

Setup¶

[1]:

import numpy as np
import xarray as xr
import pandas as pd
import scipy.stats

from gemmr.data import load_outcomes, print_ds_stats
from gemmr.sample_analysis import *
from gemmr.plot import *

import matplotlib
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D
import matplotlib.patheffects as path_effects

import holoviews as hv
from holoviews import opts
hv.extension('matplotlib')
hv.renderer('matplotlib').set_param(dpi=120)

from my_config import *

import warnings
warnings.filterwarnings(
    'ignore', 'aspect is not supported for Axes with xscale=log, yscale=linear', category=UserWarning
)  # holoviews emits this for log-linear plots

/Users/mdh56/Projects/gemmr/gemmr/sample_analysis/analyzers.py:13: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)
  from tqdm.autonotebook import tqdm

[2]:

r_clrs = hv.Palette(cmap_r, samples=3).values
n_per_ftr_clrs = hv.Palette(cmap_n_per_ftr, samples=4).values
clr_random = 'darkslateblue'
clr_perm = 'gray'

2 features¶

First, we look into a toy example where both datasets have only 2 features. In this case, weight vectors are two-dimensional unit vectors and can therefore be illustrated as points on a circle.

[ ]:

n_rep = 10000

ds_cca_2 = analyze_model_parameters(
    'cca',
    pxs=[2],
    py='px',
    n_per_ftrs=[n_per_ftr_typical, 10*n_per_ftr_typical],
    rs=[.3],
    axPlusay_range=(0, 0),
    n_Sigmas=1,
    n_rep=n_rep,
)

ds_pls_2 = analyze_model_parameters(
    'pls',
    pxs=[2],
    py='px',
    n_per_ftrs=[.5, n_per_ftr_typical, 10*n_per_ftr_typical],
    rs=[.3],
    axPlusay_range=(-2, -2),
    n_Sigmas=1,
    n_rep=n_rep,
)

dss_2 = dict(cca=ds_cca_2, pls=ds_pls_2)

[4]:

def calc_angles(da):
    return xr.apply_ufunc(
        lambda x: np.arctan(x[1] / x[0]) + np.pi/2,
        da,
        input_core_dims=[['x_feature']],
        vectorize=True
    )


def directional_mean(angles):
    R = np.exp(2j*angles).mean()
    mn = np.angle(R) / 2#np.arctan(R.imag / R.real)# / 2
    if mn < 0:
        mn = np.pi + mn
    return mn

[5]:

thetas = np.linspace(0, np.pi, 18)
panels_weights_2 = dict(cca=hv.Overlay(), pls=hv.Overlay())


def samples_per_feature_legend(plot, element):
    ax = plot.handles['axis']
    kwargs = dict(ha='center', fontsize=8, transform=ax.transAxes)
    ax.text(1.35, .2, 'samples\nper feature', color='black', **kwargs)
    ax.text(1.35, .45, '50', color=n_per_ftr_clrs[2], **kwargs)
    ax.text(1.35, .575, '5', color=n_per_ftr_clrs[1], **kwargs)
    ax.text(1.35, .7, '0.5', color=n_per_ftr_clrs[0],
            path_effects=[path_effects.Stroke(linewidth=.1, foreground='darkgrey'), path_effects.Normal()], **kwargs)



def clip_off(plot, element):
    artist = plot.handles['artist']
    artist.set_clip_on(False)


def cca_labels(plot, element):
    ax = plot.handles['axis']

mx_r = 4250


for model in ['cca', 'pls']:

    _ds = dss_2[model].sel(mode=0, Sigma_id=0, px=2, r=0.3)
    angle_true = float(calc_angles(_ds.x_weights_true))
    angles_estim = calc_angles(_ds.x_weights)
    panels_weights_2[model] *= hv.VLine(angle_true).opts(color='black')
    panels_weights_2[model] *= hv.Text(angle_true, 1.25*mx_r, 'true', rotation=180*angle_true/np.pi-90, halign='center', valign='center', fontsize=8)

    for n_per_ftr in _ds.n_per_ftr[::-1].values:
        angles_ = angles_estim.sel(n_per_ftr=n_per_ftr)
        panels_weights_2[model] *= polar_hist(angles_).relabel('n/ftr=%s' % str(n_per_ftr))

        mn_angle = directional_mean(angles_.values)
        if model == 'cca':
            delta_r = n_per_ftr*10
        else:
            delta_r = 0
        panels_weights_2[model] *= hv.Scatter(([mn_angle], [1*mx_r + delta_r])).opts(hooks=[clip_off])

    panels_weights_2[model] *= hv.Text(np.pi+.25, 3000, 'PC2', fontsize=8)
    panels_weights_2[model] *= hv.VLine(np.pi/2).opts(color='black', linestyle='--', linewidth=1)
    panels_weights_2[model] *= hv.Text(np.pi/2, 1.15*mx_r, 'PC1', fontsize=8, halign='center', valign='center')

    panels_weights_2[model] *= hv.Text(np.pi/2, 1.35*mx_r, '$r_\mathrm{true}=0.3$, 2 ftrs/set', halign='center', valign='bottom', fontsize=8)


fig_weights_2 = (
    panels_weights_2['cca'].opts(hooks=[samples_per_feature_legend, suptitle_cca])
    + panels_weights_2['pls'].opts(hooks=[suptitle_pls])
).redim(
    x='r_hist',
    y='angle_hist'
).opts(*fig_opts).opts(
    #opts.Histogram(projection='polar', facecolor=(0, 0, 0, 0), edgecolor=hv.Palette('cool')),
    opts.Curve(color=hv.Cycle(n_per_ftr_clrs[:-1][::-1])),
    opts.Scatter(s=15, color=hv.Cycle(n_per_ftr_clrs[:-1][::-1])),
    opts.Overlay(sublabel_position=(-.35, .75), xaxis='bare', yaxis='bare', ylim=(0, mx_r), xlim=(0, np.pi), show_legend=False),
    #opts.Layout(fig_inches=(3.42, 1))
)

fig_weights_2

[5]:

Load data for following analyses¶

[6]:

res = dict(
    cca=load_outcomes('cca', tag='pcBias'),
    pls=load_outcomes('pls', tag='pcBias_axPlusay-2')
)

What’s in the outcome data files?

[7]:

print_ds_stats(res['cca'])

n_modes          1
n_rep            100
n_perm           10
n_per_ftr        [200 128  64  32  16   8   5   3]
r                [0.1 0.3 0.5]
px               [64 32 16  8  4  2]
ax+ay range     (0.00, 0.00)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 6, r: 3)>
array([[10, 10, 10],
       [10, 10, 10],
       [10, 10, 10],
       [10, 10, 10],
       [10, 10, 10],
       [10, 10, 10]])
Coordinates:
  * r        (r) float64 0.1 0.3 0.5
  * px       (px) int64 64 32 16 8 4 2

power           not calculated

[8]:

print_ds_stats(res['pls'])

n_modes          1
n_rep            100
n_perm           10
n_per_ftr        [200 128  64  32  16   8   5   3]
r                [0.1 0.3 0.5]
px               [64 32 16  8  4  2]
ax+ay range     (-2.00, -2.00)
py              == px

<xarray.DataArray 'n_Sigmas' (px: 6, r: 3)>
array([[10, 10, 10],
       [10, 10, 10],
       [10, 10, 10],
       [10, 10, 10],
       [10, 10, 10],
       [10, 10, 10]])
Coordinates:
  * r        (r) float64 0.1 0.3 0.5
  * px       (px) int64 64 32 16 8 4 2

power           not calculated

How do weight vectors look like in the principal component coordinate system?¶

[11]:

def plot_weight_pc_cossim(ds, n_pcs=None, color=None):

    if n_pcs is None:
        n_pcs = ds.px.values.max()

    sim = np.abs(ds.x_weights_pc_cossim.sel(px=n_pcs))
    sim_perm = np.abs(ds.x_weights_pc_cossim_perm.sel(px=n_pcs))

    other_dims = [d for d in sim.dims if d != 'x_feature']
    sim = sim.stack(it=other_dims).mean('it')
    sim_perm = sim_perm.stack(it=other_dims + ['perm']).mean('it')

    random_data = np.abs(scipy.stats.beta.rvs((n_pcs-1)/2, (n_pcs-1)/2, size=(10000)) * 2 - 1)

    panel = (
        hv.HLine(random_data.mean())
        * hv.Curve(
            (np.arange(1, len(sim_perm)+1), sim_perm.values),
            label=r'$\langle$null$\rangle$').opts(linestyle='--')
        * hv.Curve(
            (np.arange(1, len(sim)+1), sim.values),
            label=r'$\langle$synthetic' + '\n' + r'datasets$\rangle$')
    ).opts(
        opts.Curve(color=color, xlim=(1, n_pcs+1)),
        opts.HLine(color=clr_random, linestyle='-', linewidth=1),
    ).redim(
        x='Principal component',
        y='Weight PC bias'
    )
    return panel

[12]:

def hook_weight_pc_cossim_legend(plot, element):

    ax = plot.handles['axis']

    patch_rnd = mpatches.Patch(color=clr_random, label='random')
    patch_2 = mpatches.Patch(color=n_per_ftr_clrs[1], label='{} samples/ftr'.format(n_per_ftr_typical))
    patch_50 = mpatches.Patch(color=n_per_ftr_clrs[-1], label='200 samples/ftr')
    line_synt = Line2D([0], [0], color='black', linewidth=1, linestyle='-', label=r'$\langle$synth$\rangle$')
    line_null = Line2D([0], [0], color='black', linewidth=1, linestyle='--', label=r'$\langle$perm$\rangle$')

    ax.legend(handles=[patch_rnd, patch_2, patch_50, line_synt, line_null], frameon=False, fontsize=7,
              handlelength=1, ncol=2, columnspacing=.5,
              loc='upper right', bbox_to_anchor=(1.475, 1.1))


panels_cca_weight_pc_cossim = (
    plot_weight_pc_cossim(res['cca'].sel(r=0.3, n_per_ftr=200), color=n_per_ftr_clrs[-1])
    * plot_weight_pc_cossim(res['cca'].sel(r=0.3, n_per_ftr=n_per_ftr_typical), color=n_per_ftr_clrs[1])
).relabel(
    r'$r_\mathrm{true}=0.3$, 64 ftrs/set'
).opts(
    hooks=[hook_weight_pc_cossim_legend], logx=True, logy=True, show_legend=True
)


panels_pls_weight_pc_cossim = (
    plot_weight_pc_cossim(res['pls'].sel(r=0.3, n_per_ftr=200), color=n_per_ftr_clrs[-1])
    * plot_weight_pc_cossim(res['pls'].sel(r=0.3, n_per_ftr=n_per_ftr_typical), color=n_per_ftr_clrs[1])
    #* hv.Text(64, 0.5, r'$r_\mathrm{true}=0.3$', fontsize=7, halign='right', valign='top')
    #* hv.Text(64, 0.38, '64 ftrs/set', fontsize=7, halign='right', valign='top')
).relabel(
    r'$r_\mathrm{true}=0.3$, 64 ftrs/set'
).opts(
    show_legend=False, logx=True, logy=True, ylabel=''
)


(
    panels_cca_weight_pc_cossim
    + panels_pls_weight_pc_cossim
).opts(*fig_opts)

[12]:

How strong is the PC bias depending on sample size and true between-set correlation?¶

We define “PC bias” as the mean cosine-similarity between estimated weights and the 1st principal component axis. The mean is taken across repetitions, dimensionalities and different instantiations of the joint covariance matrix.

[14]:

panels_nPerFtrs_cossim = {model: hv.Overlay() for model in res}

for model in res:

    mean_beta_rvs = xr.DataArray(pd.Series(
        {px: np.abs(scipy.stats.beta.rvs((px-1)/2, (px-1)/2, size=(10000)) * 2 - 1).mean()
         for px in res[model].px.values},
    )).rename(dim_0='px')

    delta_pc1bias = np.maximum(
        (np.abs(res[model].x_weights_pc_cossim.sel(x_feature=0)) - mean_beta_rvs).mean('rep').mean('Sigma_id').mean('px'),
        (np.abs(res[model].y_weights_pc_cossim.sel(y_feature=0)) - mean_beta_rvs).mean('rep').mean('Sigma_id').mean('px')
    )

    delta_pc1bias_perm = np.maximum(
        (np.abs(res[model].x_weights_pc_cossim_perm.sel(x_feature=0)) - mean_beta_rvs).mean('rep').mean('Sigma_id').mean('perm').mean('px'),
        (np.abs(res[model].y_weights_pc_cossim_perm.sel(y_feature=0)) - mean_beta_rvs).mean('rep').mean('Sigma_id').mean('perm').mean('px')
    )

    for r in delta_pc1bias.r.values:
        panels_nPerFtrs_cossim[model] *= hv.Curve(delta_pc1bias.sel(r=r), label=r'$r_\mathrm{true}=%.1f$' % r)

    for r in delta_pc1bias_perm.r.values:
        panels_nPerFtrs_cossim[model] *= hv.Curve(delta_pc1bias_perm.sel(r=r), label=r'$r_\mathrm{true}=%.1f$' % r).opts(linestyle='--')

fig_nPerFtrs_cossim = (
    (
        panels_nPerFtrs_cossim['cca']
        * hv.Text(50, .4, r'$r_\mathrm{true}=$', fontsize=7, halign='center')
        * hv.Text(50, .325, '0.1', fontsize=7, halign='center').opts(color=r_clrs[0])
        * hv.Text(50, .25, '0.3', fontsize=7, halign='center').opts(color=r_clrs[1])
        * hv.Text(50, .175, '0.5', fontsize=7, halign='center').opts(color=r_clrs[2])
    ).opts(show_legend=False)
    + panels_nPerFtrs_cossim['pls'].opts(show_legend=False, ylabel='')
).redim(
    n_per_ftr='Samples per feature',
    y=hv.Dimension('cossim_weights_1stPCaxis', label='Avg. weight PC1 bias\n(data - random)')
).opts(*fig_opts).opts(
    opts.Curve(color=hv.Cycle(r_clrs)),
    opts.Overlay(logx=True)
)

fig_nPerFtrs_cossim

[14]:

Figure¶

[17]:

def axpos_middlerow(plot, element):
    ax = plot.handles['axis']
    bbox = ax.get_position()
    ax.set_position((bbox.x0, bbox.y0+.025, bbox.width, bbox.height))


def axpos_bottomrow(plot, element):
    ax = plot.handles['axis']
    bbox = ax.get_position()
    ax.set_position((bbox.x0+.025, .05, .85*bbox.width, .9*bbox.height))


def synth_perm_legend(plot, element):

    ax = plot.handles['axis']

    line_synt = Line2D([0], [0], color='black', linewidth=1, linestyle='-', label=r'synth')
    line_null = Line2D([0], [0], color='black', linewidth=1, linestyle='--', label=r'perm')

    ax.legend(handles=[line_synt, line_null], frameon=False, fontsize=7,
              handlelength=1, ncol=1,
              loc='upper right', bbox_to_anchor=(1.475, 1.03))


fig = (
    fig_weights_2
    + panels_cca_weight_pc_cossim.opts(hooks=[axpos_middlerow, hook_weight_pc_cossim_legend])
    + panels_pls_weight_pc_cossim.opts(hooks=[axpos_middlerow])
    + fig_nPerFtrs_cossim.Overlay.I.opts(opts.Overlay(hooks=[axpos_bottomrow, synth_perm_legend, legend_frame_off]))
    + fig_nPerFtrs_cossim.Overlay.II.opts(opts.Overlay(hooks=[axpos_bottomrow]))
).cols(
    2
).opts(*fig_opts).opts(
    opts.Curve(linewidth=1.5),
    opts.Layout(sublabel_position=(-.35, .95), vspace=.4),
)

hv.save(fig, 'fig/fig4_pc_bias.pdf')

fig

[17]: