Dimensionality Reduction Utilities

Flexible utilities for PCA, MDS, and biplot visualizations.

The Plotter Class

This class provides a unified framework for dimensionality reduction techniques, with a focus on visualization.
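The typical workflow, shown in full in the examples below, is construct, fit, then plot. A minimal sketch using the same arguments as the worked example that follows:

# Construct a plotter around a features-by-samples DataFrame, fit PCA, then plot the samples
plotter = DimensionalityReductionPlotter(in_df=df, top=50, color_dictionary=color_dict)
plotter.fit(method='pca', n_components=5)
fig, ax, scores = plotter.plot_samples(palette=sample_groups)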


source

DimensionalityReductionPlotter

 DimensionalityReductionPlotter (in_df:pandas.core.frame.DataFrame, top:int=500,
                                 color_dictionary:Optional[Dict[str,str]]=None)

A class for creating dimensionality reduction plots (PCA, MDS) from pandas DataFrames.
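The examples on this page assume imports along these lines; the module path for the plotting utilities is a placeholder, so point it at wherever the class is defined in your installation:

# Assumed imports for the examples below (module path is a placeholder)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from your_package.dim_reduction import (
    DimensionalityReductionPlotter,
    create_dim_reduction_dashboard,
)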

# Create synthetic data
np.random.seed(42)

# Number of samples and features
n_samples = 50
n_features = 100

# Generate random data with two distinct groups and some structure
data = np.random.randn(n_features, n_samples)

# Add some structure - make the first 30 features higher in first 25 samples
data[:30, :25] += 2

# Create a pandas DataFrame
feature_names = [f'feature_{i}' for i in range(n_features)]
sample_names = [f'sample_{i}' for i in range(n_samples)]
df = pd.DataFrame(data, index=feature_names, columns=sample_names)

# Create sample groups - mapping each sample to its color
sample_groups = {}
for i in range(n_samples):
    if i < 25:
        sample_groups[f'sample_{i}'] = 'red'
    else:
        sample_groups[f'sample_{i}'] = 'blue'

# Create feature groups - mapping each feature to its color
feature_groups = {}
for i in range(n_features):
    if i < 30:
        feature_groups[f'feature_{i}'] = 'green'
    elif i < 60:
        feature_groups[f'feature_{i}'] = 'purple'
    else:
        feature_groups[f'feature_{i}'] = 'orange'

# Create a color dictionary for nice labels
color_dict = {
    'red': 'Group A',
    'blue': 'Group B',
    'green': 'Gene Set 1',
    'purple': 'Gene Set 2',
    'orange': 'Gene Set 3'
}

# Preview the data
df.iloc[:5, :5]
           sample_0  sample_1  sample_2  sample_3  sample_4
feature_0  2.496714  1.861736  2.647689  3.523030  1.765847
feature_1  2.324084  1.614918  1.323078  2.611676  3.031000
feature_2  0.584629  1.579355  1.657285  1.197723  1.838714
feature_3  2.250493  2.346448  1.319975  2.232254  2.293072
feature_4  2.357787  2.560785  3.083051  3.053802  0.622331
# Create a plotter instance
plotter = DimensionalityReductionPlotter(
    in_df=df,
    top=50,  # Use top 50 features
    color_dictionary=color_dict
)

# Fit PCA and plot samples
plotter.fit(method='pca', n_components=5)
fig, ax, tmp_df = plotter.plot_samples(
    palette=sample_groups,
    point_size=80,
    do_adjust_text=False,
    title="PCA of Synthetic Data"
)
tmp_df.iloc[:5, :5]
Explained variance ratio: [0.42735199 0.04923835 0.04162556 0.0367032  0.0323377 ]
              pc_1      pc_2      pc_3      pc_4      pc_5
sample_0  4.833824 -0.524305 -1.099760 -1.171201  0.348957
sample_1  4.922518 -2.195776 -3.805798 -2.724017 -2.896950
sample_2  5.531237  2.777291 -0.293219 -2.368883 -0.057843
sample_3  7.213643  1.604138 -2.255550 -1.751800  2.053669
sample_4  5.982604 -0.348769 -0.019634  1.754475  0.805486
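As a quick sanity check on the scores, the returned DataFrame can be combined with the sample_groups mapping to confirm that the two synthetic groups separate along PC1 (a small follow-up sketch using the objects defined above):

# Mean PC1 score per sample group; the +2 shift added above should separate red from blue
scores = tmp_df.copy()
scores['group'] = [sample_groups[s] for s in scores.index]
print(scores.groupby('group')['pc_1'].mean())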

# Plot the feature loadings as arrows
fig, ax, tmp_df = plotter.plot_loadings(
    palette=feature_groups,
    arrow=True,
    arrow_scale=3,
    title="PCA Feature Loadings",
    biggest=2
)
tmp_df.iloc[:5, :5]
                pc_1      pc_2      pc_3      pc_4      pc_5
feature_56 -0.009073 -0.075461  0.134354 -0.212345 -0.081578
feature_70 -0.037480  0.082879 -0.150355 -0.106533  0.243716
feature_99 -0.016096  0.101830 -0.366043 -0.102091 -0.031489
feature_71 -0.014748  0.209104  0.073929 -0.087332  0.088316
feature_94  0.039709 -0.044942  0.003832 -0.093297  0.106068
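The same loadings DataFrame makes it easy to rank features by their contribution to a component, for example the largest absolute loadings on PC1 (a minimal sketch):

# Features with the largest absolute loading on the first principal component
print(tmp_df['pc_1'].abs().sort_values(ascending=False).head())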

# Create a biplot
fig, ax, tmp_df_dict = plotter.plot_biplot(
    feature_palette=feature_groups,
    sample_palette=sample_groups,
    arrow_scale=4,
    sample_size=70,
    title="PCA Biplot of Synthetic Data",
    #biggest=2
)
print(tmp_df_dict['samples'].iloc[:5, :5])
print(tmp_df_dict['features'].iloc[:5, :5])
              pc_1      pc_2      pc_3      pc_4      pc_5
sample_0  4.833824 -0.524305 -1.099760 -1.171201  0.348957
sample_1  4.922518 -2.195776 -3.805798 -2.724017 -2.896950
sample_2  5.531237  2.777291 -0.293219 -2.368883 -0.057843
sample_3  7.213643  1.604138 -2.255550 -1.751800  2.053669
sample_4  5.982604 -0.348769 -0.019634  1.754475  0.805486
                pc_1      pc_2      pc_3      pc_4      pc_5
feature_56 -0.009073 -0.075461  0.134354 -0.212345 -0.081578
feature_70 -0.037480  0.082879 -0.150355 -0.106533  0.243716
feature_99 -0.016096  0.101830 -0.366043 -0.102091 -0.031489
feature_71 -0.014748  0.209104  0.073929 -0.087332  0.088316
feature_94  0.039709 -0.044942  0.003832 -0.093297  0.106068

# Cumulative explained variance
fig, ax, tmp_df = plotter.plot_explained_variance(
    cumulative=True,
    color="#9B1D20"
)
tmp_df.head()
   component  explained_variance  cumulative_variance
0          1           42.735199            42.735199
1          2            4.923835            47.659034
2          3            4.162556            51.821590
3          4            3.670320            55.491911
4          5            3.233770            58.725681
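The returned DataFrame can also drive a simple component-selection rule, e.g. the smallest number of components whose cumulative variance crosses a chosen threshold (a minimal sketch using the columns shown above):

# Smallest number of components explaining at least 50% of the total variance
threshold = 50.0
enough = tmp_df[tmp_df['cumulative_variance'] >= threshold]
if not enough.empty:
    print(f"{int(enough['component'].iloc[0])} components reach {threshold}% cumulative variance")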

# Individual explained variance
fig, ax, tmp_df = plotter.plot_explained_variance(
    cumulative=False,
    color="#1E88E5",
    title="Individual Variance Explained per Principal Component"
)
tmp_df.head()
   component  explained_variance  cumulative_variance
0          1           42.735199            42.735199
1          2            4.923835            47.659034
2          3            4.162556            51.821590
3          4            3.670320            55.491911
4          5            3.233770            58.725681

# Switch to MDS
plotter.fit(method='mds', metric=True, random_state=42)

# Plot MDS results
fig, ax, tmp_df = plotter.plot_samples(
    palette=sample_groups,
    point_size=80,
    title="MDS of Synthetic Data"
)
tmp_df.head()
              dim_1      dim_2 color
sample_0  -2.463140   4.037141   red
sample_1  -8.969954   3.636025   red
sample_2   0.072362   6.270072   red
sample_3  -3.341935  10.120119   red
sample_4  -3.417970   6.328873   red
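The MDS coordinates come back in the same tidy form, so per-group summaries are straightforward, for instance the centroid of each color group (a short sketch using the color column shown above):

# Centroid of each sample group in the 2-D MDS embedding
print(tmp_df.groupby('color')[['dim_1', 'dim_2']].mean())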


source

create_dim_reduction_dashboard

 create_dim_reduction_dashboard (in_df, sample_palette, feature_palette,
                                 top=50, color_dictionary=None, n_components=5,
                                 title='Dimensionality Reduction Dashboard',
                                 figsize=(16, 12))

Create a comprehensive 2x2 dashboard of dimensionality reduction visualizations.

# Create the dashboard
fig, axes, results_dict = create_dim_reduction_dashboard(
    in_df=df,
    sample_palette=sample_groups,
    feature_palette=feature_groups,
    top=50,
    color_dictionary=color_dict,
    title="Synthetic Data Analysis Dashboard"
)

# Fine tune the figure if needed
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to accommodate suptitle and caption

# Now you have access to all the DataFrames for further analysis
print("Available DataFrames in results_dict:")
for key in results_dict:
    print(f"- {key}: {results_dict[key].shape}")
    
# Example of further analysis with the returned DataFrames
print("\nExamined variance explained by first 3 components:")
print(results_dict['explained_variance'].head(3))
Explained variance ratio: [0.42735199 0.04923835 0.04162556 0.0367032  0.0323377 ]
Available DataFrames in results_dict:
- pca_samples: (50, 6)
- pca_loadings: (50, 5)
- explained_variance: (5, 3)
- mds_samples: (50, 3)

Variance explained by the first 3 components:
   component  explained_variance  cumulative_variance
0          1           42.735199            42.735199
1          2            4.923835            47.659034
2          3            4.162556            51.821590
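If the dashboard outputs need to be kept for downstream work, the figure and the returned DataFrames can be saved with standard matplotlib and pandas calls (the filenames here are arbitrary):

# Save the dashboard figure and the underlying DataFrames
fig.savefig('dim_reduction_dashboard.png', dpi=300, bbox_inches='tight')
for key, frame in results_dict.items():
    frame.to_csv(f'{key}.csv')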