# Example usage with different palette options
def demonstrate_group_mapping_palettes():
    """Plot the same grouped items under four different colour palettes.

    Builds 15 example item names, maps them to colours in groups of three
    with ``create_group_color_mapping`` (default palette, ``tab10``,
    ``Set2`` and ``viridis``), and draws one row of coloured bars per
    palette, each with a legend built from the colour-to-group mapping.
    The first five colour keys of the ``tab10`` mapping are printed.

    Returns:
        tuple: ``(fig, axes)`` as returned by ``plt.subplots(4, 1, ...)``.
    """
    # Create a list of items
    items = [f'gene_{i}' for i in range(15)]

    # Create a figure with multiple palette examples
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))

    # Example 1: Default palette
    color_map1, group_map1 = create_group_color_mapping(
        items, group_size=3, return_color_to_group=True
    )

    # Example 2: tab10 palette
    color_map2, group_map2 = create_group_color_mapping(
        items, group_size=3, palette_name='tab10', return_color_to_group=True
    )

    # Example 3: Set2 palette
    color_map3, group_map3 = create_group_color_mapping(
        items, group_size=3, palette_name='Set2', return_color_to_group=True
    )

    # Example 4: viridis palette
    color_map4, group_map4 = create_group_color_mapping(
        items, group_size=3, palette_name='viridis', return_color_to_group=True
    )

    # Plot all examples
    palettes = [
        ('Default Palette', color_map1, group_map1),
        ('tab10 Palette', color_map2, group_map2),
        ('Set2 Palette', color_map3, group_map3),
        ('viridis Palette', color_map4, group_map4)
    ]

    for i, (title, color_map, group_map) in enumerate(palettes):
        ax = axes[i]

        # Plot bars: one bar of width 0.8 per item, placed at x = j
        for j, item in enumerate(items):
            ax.barh(0, 0.8, left=j, height=0.8, color=color_map[item], alpha=0.7)
            if i == 0:  # Only add labels on the first plot
                ax.text(j+0.4, 0, item, rotation=90, ha='center', va='bottom')

        # Add legend (group_map maps colour -> group label)
        legend_elements = [Patch(facecolor=color, label=group) for color, group in group_map.items()]
        ax.legend(handles=legend_elements, loc='upper center', ncol=len(group_map))

        ax.set_ylim(-0.5, 0.5)
        ax.set_xlim(-0.5, len(items) - 0.5)
        ax.set_yticks([])
        ax.set_xticks([])
        ax.set_title(title)

    plt.tight_layout()

    # Print example of hex colors from tab10
    print("Example hex colors from tab10 palette:")
    for color in list(group_map2.keys())[:5]:
        print(color)

    return fig, axes
Semi Random Collection of functions
convert_palette_to_hex
convert_palette_to_hex (palette_name, n_colors)
Convert a named color palette to hex color codes.
create_group_color_mapping
create_group_color_mapping (items, group_size=3, palette=None, palette_name=None, return_color_to_group=False)
Create a color mapping dictionary that assigns the same color to items in groups.
demonstrate_group_mapping_palettes()
Example hex colors from tab10 palette:
#1f77b4
#ff7f0e
#2ca02c
#d62728
#9467bd
(<Figure size 864x720 with 4 Axes>,
array([<Axes: title={'center': 'Default Palette'}>,
<Axes: title={'center': 'tab10 Palette'}>,
<Axes: title={'center': 'Set2 Palette'}>,
<Axes: title={'center': 'viridis Palette'}>], dtype=object))
norm_loading
norm_loading (df)
Normalize datasets by equalizing the medians of all columns to a common target value.
This function implements a median normalization strategy that:
1. Calculates the median value for each column in the input dataframe
2. Computes a target value (the mean of all column medians)
3. Derives normalization factors to adjust each column to the target median
4. Applies these normalization factors to create a normalized dataset
# Example: median normalization with norm_loading on synthetic data,
# shown as side-by-side boxplots before and after normalization.

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic data with controlled medians
# Creating a dataset with 100 features (rows) and 6 samples (columns)
# - 3 replicates for condition 1 (lower median)
# - 3 replicates for condition 2 (higher median)

# Number of features (e.g., proteins, genes)
n_features = 100

# Create condition 1 data (3 replicates with similar distribution)
condition1_rep1 = np.random.normal(loc=100, scale=25, size=n_features)
condition1_rep2 = np.random.normal(loc=95, scale=20, size=n_features)
condition1_rep3 = np.random.normal(loc=90, scale=22, size=n_features)

# Create condition 2 data (3 replicates with higher median)
condition2_rep1 = np.random.normal(loc=150, scale=30, size=n_features)
condition2_rep2 = np.random.normal(loc=160, scale=28, size=n_features)
condition2_rep3 = np.random.normal(loc=155, scale=32, size=n_features)

# Create a DataFrame
data = pd.DataFrame({
    'Cond1_Rep1': condition1_rep1,
    'Cond1_Rep2': condition1_rep2,
    'Cond1_Rep3': condition1_rep3,
    'Cond2_Rep1': condition2_rep1,
    'Cond2_Rep2': condition2_rep2,
    'Cond2_Rep3': condition2_rep3
})

# Apply the normalization function
data_normalized = norm_loading(data)

# Set up the figure for visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))

# Color mapping for conditions
colors = ['#3498db', '#e74c3c']  # Blue for Condition 1, Red for Condition 2
condition_colors = {
    'Cond1_Rep1': colors[0], 'Cond1_Rep2': colors[0], 'Cond1_Rep3': colors[0],
    'Cond2_Rep1': colors[1], 'Cond2_Rep2': colors[1], 'Cond2_Rep3': colors[1]
}

# 1. Boxplot for raw data (before normalization)
ax1.set_title('Before Normalization', fontsize=14, fontweight='bold')
sns.boxplot(data=data, ax=ax1, palette=condition_colors)
# Get current tick positions
ax1_ticks = ax1.get_xticks()
ax1_labels = [label.get_text() for label in ax1.get_xticklabels()]
# Set ticks and then ticklabels
ax1.set_xticks(ax1_ticks)
ax1.set_xticklabels(ax1_labels, rotation=45, ha='right')
ax1.set_ylabel('Value', fontsize=12)
ax1.grid(axis='y', linestyle='--', alpha=0.7)

# 2. Boxplot for normalized data
ax2.set_title('After Normalization', fontsize=14, fontweight='bold')
sns.boxplot(data=data_normalized, ax=ax2, palette=condition_colors)
# Get current tick positions
ax2_ticks = ax2.get_xticks()
ax2_labels = [label.get_text() for label in ax2.get_xticklabels()]
# Set ticks and then ticklabels
ax2.set_xticks(ax2_ticks)
ax2.set_xticklabels(ax2_labels, rotation=45, ha='right')
ax2.set_ylabel('Normalized Value', fontsize=12)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Create a custom legend for conditions
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=colors[0], label='Condition 1'),
    Patch(facecolor=colors[1], label='Condition 2')
]
fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, 0.05), ncol=2)

# Adjust layout to make room for annotations
plt.tight_layout(rect=[0, 0.1, 1, 0.9])
plt.show()
medians [ 96.82609271 96.6821434 92.14930633 151.50472099 157.87481208
151.45188746]
target 124.41482716043556
norm_facs [1.28493078 1.28684391 1.35014394 0.82119439 0.78806002 0.82148086]
quantileNormalize
quantileNormalize (df_input, keep_na=True)
Perform quantile normalization on a pandas DataFrame.
Quantile normalization is a technique that makes the distribution of values for each column identical by transforming the values to match the distribution of the mean of quantiles across all columns.
Algorithm:
1. Sort values in each column independently
2. Calculate the mean across rows of the sorted data (creating a reference distribution)
3. For each original value, assign the corresponding value from the reference distribution based on its rank in its original column
# Example: quantile normalization with quantileNormalize on synthetic data,
# shown as side-by-side boxplots before and after normalization.

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic data with controlled medians
# Creating a dataset with 100 features (rows) and 6 samples (columns)
# - 3 replicates for condition 1 (lower median)
# - 3 replicates for condition 2 (higher median)

# Number of features (e.g., proteins, genes)
n_features = 100

# Create condition 1 data (3 replicates with similar distribution)
condition1_rep1 = np.random.normal(loc=100, scale=25, size=n_features)
condition1_rep2 = np.random.normal(loc=95, scale=20, size=n_features)
condition1_rep3 = np.random.normal(loc=90, scale=22, size=n_features)

# Create condition 2 data (3 replicates with higher median)
condition2_rep1 = np.random.normal(loc=150, scale=30, size=n_features)
condition2_rep2 = np.random.normal(loc=160, scale=28, size=n_features)
condition2_rep3 = np.random.normal(loc=155, scale=32, size=n_features)

# Create a DataFrame
data = pd.DataFrame({
    'Cond1_Rep1': condition1_rep1,
    'Cond1_Rep2': condition1_rep2,
    'Cond1_Rep3': condition1_rep3,
    'Cond2_Rep1': condition2_rep1,
    'Cond2_Rep2': condition2_rep2,
    'Cond2_Rep3': condition2_rep3
})

# Apply the normalization function
data_normalized = quantileNormalize(data)

# Set up the figure for visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))

# Color mapping for conditions
colors = ['#3498db', '#e74c3c']  # Blue for Condition 1, Red for Condition 2
condition_colors = {
    'Cond1_Rep1': colors[0], 'Cond1_Rep2': colors[0], 'Cond1_Rep3': colors[0],
    'Cond2_Rep1': colors[1], 'Cond2_Rep2': colors[1], 'Cond2_Rep3': colors[1]
}

# 1. Boxplot for raw data (before normalization)
ax1.set_title('Before Normalization', fontsize=14, fontweight='bold')
sns.boxplot(data=data, ax=ax1, palette=condition_colors)
# Get current tick positions
ax1_ticks = ax1.get_xticks()
ax1_labels = [label.get_text() for label in ax1.get_xticklabels()]
# Set ticks and then ticklabels
ax1.set_xticks(ax1_ticks)
ax1.set_xticklabels(ax1_labels, rotation=45, ha='right')
ax1.set_ylabel('Value', fontsize=12)
ax1.grid(axis='y', linestyle='--', alpha=0.7)

# 2. Boxplot for normalized data
ax2.set_title('After Normalization', fontsize=14, fontweight='bold')
sns.boxplot(data=data_normalized, ax=ax2, palette=condition_colors)
# Get current tick positions
ax2_ticks = ax2.get_xticks()
ax2_labels = [label.get_text() for label in ax2.get_xticklabels()]
# Set ticks and then ticklabels
ax2.set_xticks(ax2_ticks)
ax2.set_xticklabels(ax2_labels, rotation=45, ha='right')
ax2.set_ylabel('Normalized Value', fontsize=12)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Create a custom legend for conditions
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=colors[0], label='Condition 1'),
    Patch(facecolor=colors[1], label='Condition 2')
]
fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, 0.05), ncol=2)

# Adjust layout to make room for annotations
plt.tight_layout(rect=[0, 0.1, 1, 0.9])
plt.show()
norm_loading_TMT
norm_loading_TMT (df)
Normalize TMT (Tandem Mass Tag) proteomics data to account for uneven sample loading.
This function performs total sum normalization, specifically designed for TMT-based multiplexed proteomics experiments where differences in total protein abundance between samples may be due to technical variations rather than biological differences.
# Example: total sum normalization with norm_loading_TMT on synthetic data,
# shown as side-by-side boxplots before and after normalization.

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic data with controlled medians
# Creating a dataset with 100 features (rows) and 6 samples (columns)
# - 3 replicates for condition 1 (lower median)
# - 3 replicates for condition 2 (higher median)

# Number of features (e.g., proteins, genes)
n_features = 100

# Create condition 1 data (3 replicates with similar distribution)
condition1_rep1 = np.random.normal(loc=100, scale=25, size=n_features)
condition1_rep2 = np.random.normal(loc=95, scale=20, size=n_features)
condition1_rep3 = np.random.normal(loc=90, scale=22, size=n_features)

# Create condition 2 data (3 replicates with higher median)
condition2_rep1 = np.random.normal(loc=150, scale=30, size=n_features)
condition2_rep2 = np.random.normal(loc=160, scale=28, size=n_features)
condition2_rep3 = np.random.normal(loc=155, scale=32, size=n_features)

# Create a DataFrame
data = pd.DataFrame({
    'Cond1_Rep1': condition1_rep1,
    'Cond1_Rep2': condition1_rep2,
    'Cond1_Rep3': condition1_rep3,
    'Cond2_Rep1': condition2_rep1,
    'Cond2_Rep2': condition2_rep2,
    'Cond2_Rep3': condition2_rep3
})

# Apply the normalization function
data_normalized = norm_loading_TMT(data)

# Set up the figure for visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))

# Color mapping for conditions
colors = ['#3498db', '#e74c3c']  # Blue for Condition 1, Red for Condition 2
condition_colors = {
    'Cond1_Rep1': colors[0], 'Cond1_Rep2': colors[0], 'Cond1_Rep3': colors[0],
    'Cond2_Rep1': colors[1], 'Cond2_Rep2': colors[1], 'Cond2_Rep3': colors[1]
}

# 1. Boxplot for raw data (before normalization)
ax1.set_title('Before Normalization', fontsize=14, fontweight='bold')
sns.boxplot(data=data, ax=ax1, palette=condition_colors)
# Get current tick positions
ax1_ticks = ax1.get_xticks()
ax1_labels = [label.get_text() for label in ax1.get_xticklabels()]
# Set ticks and then ticklabels
ax1.set_xticks(ax1_ticks)
ax1.set_xticklabels(ax1_labels, rotation=45, ha='right')
ax1.set_ylabel('Value', fontsize=12)
ax1.grid(axis='y', linestyle='--', alpha=0.7)

# 2. Boxplot for normalized data
ax2.set_title('After Normalization', fontsize=14, fontweight='bold')
sns.boxplot(data=data_normalized, ax=ax2, palette=condition_colors)
# Get current tick positions
ax2_ticks = ax2.get_xticks()
ax2_labels = [label.get_text() for label in ax2.get_xticklabels()]
# Set ticks and then ticklabels
ax2.set_xticks(ax2_ticks)
ax2.set_xticklabels(ax2_labels, rotation=45, ha='right')
ax2.set_ylabel('Normalized Value', fontsize=12)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Create a custom legend for conditions
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=colors[0], label='Condition 1'),
    Patch(facecolor=colors[1], label='Condition 2')
]
fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, 0.05), ncol=2)

# Adjust layout to make room for annotations
plt.tight_layout(rect=[0, 0.1, 1, 0.9])
plt.show()
ires_norm
ires_norm (df, exps_columns)
Implement Internal Reference Scaling (IRS) normalization for combining multiple TMT experiments.
This function normalizes and integrates data from multiple TMT experiments by:
1. Computing the sum of each protein's intensity across all channels within each experiment
2. Calculating the geometric mean of these sums across experiments (reference value)
3. Deriving scaling factors to adjust each experiment to this reference
4. Applying an additional total sum normalization to the combined dataset
# Example: IRS normalization with ires_norm on synthetic data, treating the
# two conditions as two experiments; boxplots before and after normalization.

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic data with controlled medians
# Creating a dataset with 100 features (rows) and 6 samples (columns)
# - 3 replicates for condition 1 (lower median)
# - 3 replicates for condition 2 (higher median)

# Number of features (e.g., proteins, genes)
n_features = 100

# Create condition 1 data (3 replicates with similar distribution)
condition1_rep1 = np.random.normal(loc=100, scale=25, size=n_features)
condition1_rep2 = np.random.normal(loc=95, scale=20, size=n_features)
condition1_rep3 = np.random.normal(loc=90, scale=22, size=n_features)

# Create condition 2 data (3 replicates with higher median)
condition2_rep1 = np.random.normal(loc=150, scale=30, size=n_features)
condition2_rep2 = np.random.normal(loc=160, scale=28, size=n_features)
condition2_rep3 = np.random.normal(loc=155, scale=32, size=n_features)

# Create a DataFrame
data = pd.DataFrame({
    'Cond1_Rep1': condition1_rep1,
    'Cond1_Rep2': condition1_rep2,
    'Cond1_Rep3': condition1_rep3,
    'Cond2_Rep1': condition2_rep1,
    'Cond2_Rep2': condition2_rep2,
    'Cond2_Rep3': condition2_rep3
})

# Apply the normalization function (one column list per experiment)
data_normalized = ires_norm(data,[['Cond1_Rep1','Cond1_Rep2','Cond1_Rep3' ],['Cond2_Rep1','Cond2_Rep2','Cond2_Rep3' ]])

# Set up the figure for visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6))

# Color mapping for conditions
colors = ['#3498db', '#e74c3c']  # Blue for Condition 1, Red for Condition 2
condition_colors = {
    'Cond1_Rep1': colors[0], 'Cond1_Rep2': colors[0], 'Cond1_Rep3': colors[0],
    'Cond2_Rep1': colors[1], 'Cond2_Rep2': colors[1], 'Cond2_Rep3': colors[1]
}

# 1. Boxplot for raw data (before normalization)
ax1.set_title('Before Normalization', fontsize=14, fontweight='bold')
sns.boxplot(data=data, ax=ax1, palette=condition_colors)
# Get current tick positions
ax1_ticks = ax1.get_xticks()
ax1_labels = [label.get_text() for label in ax1.get_xticklabels()]
# Set ticks and then ticklabels
ax1.set_xticks(ax1_ticks)
ax1.set_xticklabels(ax1_labels, rotation=45, ha='right')
ax1.set_ylabel('Value', fontsize=12)
ax1.grid(axis='y', linestyle='--', alpha=0.7)

# 2. Boxplot for normalized data
ax2.set_title('After Normalization', fontsize=14, fontweight='bold')
sns.boxplot(data=data_normalized, ax=ax2, palette=condition_colors)
# Get current tick positions
ax2_ticks = ax2.get_xticks()
ax2_labels = [label.get_text() for label in ax2.get_xticklabels()]
# Set ticks and then ticklabels
ax2.set_xticks(ax2_ticks)
ax2.set_xticklabels(ax2_labels, rotation=45, ha='right')
ax2.set_ylabel('Normalized Value', fontsize=12)
ax2.grid(axis='y', linestyle='--', alpha=0.7)

# Create a custom legend for conditions
from matplotlib.patches import Patch
legend_elements = [
    Patch(facecolor=colors[0], label='Condition 1'),
    Patch(facecolor=colors[1], label='Condition 2')
]
fig.legend(handles=legend_elements, loc='upper center', bbox_to_anchor=(0.5, 0.05), ncol=2)

# Adjust layout to make room for annotations
plt.tight_layout(rect=[0, 0.1, 1, 0.9])
plt.show()
clean_id
clean_id (temp_id)
mod_hist_legend
mod_hist_legend (ax, title=False)
Creates a cleaner legend for histogram plots (e.g. when using `histtype='step'`) by using line elements instead of patches.
Motivation:
- Default histogram legends show rectangle patches which can be visually distracting
- This function creates a more elegant legend with simple lines matching histogram edge colors
- Positions the legend outside the plot to avoid overlapping with data
# Example: compare the default histogram legend (left) with the legend
# produced by mod_hist_legend (right).

# Create sample data for multiple distributions
np.random.seed(42)  # For reproducibility
data_a = np.random.normal(0, 1, 1000)
data_b = np.random.normal(3, 1.5, 1500)

# Create a figure with 2 subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Left subplot: Default histogram legend
ax1.hist(data_a, bins=30, alpha=0.7, label='Distribution A', edgecolor='blue', histtype='step')
ax1.hist(data_b, bins=30, alpha=0.7, label='Distribution B', edgecolor='red', histtype='step')
ax1.set_title('Default Legend')
# Default legend
ax1.legend()

# Right subplot: Modified histogram legend
ax2.hist(data_a, bins=30, alpha=0.7, label='Distribution A', edgecolor='blue', histtype='step')
ax2.hist(data_b, bins=30, alpha=0.7, label='Distribution B', edgecolor='red', histtype='step')
ax2.set_title('Modified Legend')
mod_hist_legend(ax2, title='Distributions')  # Apply our function

# Adjust layout to give space for the right-side legend
plt.tight_layout()
fig.subplots_adjust(right=0.85)

# Display the figure
plt.show()
clean_axes
clean_axes (ax, offset=10)
Customizes a matplotlib axes by removing top and right spines, and creating a broken axis effect where x and y axes don’t touch.
# Example: compare default axes (left) with axes styled by clean_axes
# (right); the right subplot also uses mod_hist_legend.

# Create sample data for multiple distributions
np.random.seed(42)  # For reproducibility
data_a = np.random.normal(0, 1, 1000)
data_b = np.random.normal(3, 1.5, 1500)

# Create a figure with 2 subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Left subplot: default axes and default histogram legend
ax1.hist(data_a, bins=30, alpha=0.7, label='Distribution A', edgecolor='blue', histtype='step')
ax1.hist(data_b, bins=30, alpha=0.7, label='Distribution B', edgecolor='red', histtype='step')
ax1.set_title('Default Legend')
# Default legend
ax1.legend()

# Right subplot: modified legend and cleaned axes
ax2.hist(data_a, bins=30, alpha=0.7, label='Distribution A', edgecolor='blue', histtype='step')
ax2.hist(data_b, bins=30, alpha=0.7, label='Distribution B', edgecolor='red', histtype='step')
ax2.set_title('Modified Axes')
mod_hist_legend(ax2, title='Distributions')  # Apply our function
clean_axes(ax2)

# Adjust layout to give space for the right-side legend
plt.tight_layout()
fig.subplots_adjust(right=0.85)

# Display the figure
plt.show()
add_desc
add_desc (data, prot_to_desc)
parse_fasta_file
parse_fasta_file (fasta_file)
Create a dictionary mapping protein ID to gene product, using a FASTA file from TriTrypDB.
get_scaled_df
get_scaled_df (df)
elbow_point
elbow_point (values)
Find the elbow point in a curve using the maximum curvature method.
| | Type | Details |
|---|---|---|
| values | list | The y-values of the curve. |
| Returns | int | The index of the elbow point. |
kmeans_cluster_analysis
kmeans_cluster_analysis (df, cluster_sizes, random_state=42, features=None, figsize=(12, 6), standardize=False, fill_na=False)
Perform K-means clustering analysis on a pandas DataFrame and visualize the results with both normalized inertia and silhouette scores on the same plot.
| | Type | Default | Details |
|---|---|---|---|
| df | pandas.DataFrame | | The input data to cluster. |
| cluster_sizes | list | | List of cluster sizes (k values) to evaluate. |
| random_state | int | 42 | Random seed for reproducibility (default: 42). |
| features | NoneType | None | List of column names to use for clustering. If None, all columns are used. |
| figsize | tuple | (12, 6) | Figure size for the output plot (default: (12, 6)). |
| standardize | bool | False | Whether to standardize the features (default: False). |
| fill_na | bool | False | Whether to fill missing values with column means (default: False). |
| Returns | tuple | | (figure, axes, inertia_values, silhouette_values) - The matplotlib figure object, the axes object, the list of inertia values, and the list of silhouette scores. |
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
# Example: run kmeans_cluster_analysis on a synthetic 4-cluster dataset,
# then refit KMeans at k=4 and check its inertia against a manual sum.

# Create synthetic dataset with 4 natural clusters
X, y = make_blobs(
    n_samples=400,
    centers=4,
    cluster_std=0.8,
    random_state=42
)

# Convert to DataFrame
df = pd.DataFrame(X, columns=['feature1', 'feature2'])

# Print basic information about the dataset
print(f"Dataset shape: {df.shape}")
print(df.head())

# Define the range of cluster sizes to test
cluster_sizes = list(range(1, 11))  # Test k from 1 to 10

# Run the kmeans cluster analysis
fig, ax, inertia_values, silhouette_values = kmeans_cluster_analysis(
    df=df,
    cluster_sizes=cluster_sizes,
    random_state=42,
    standardize=True,  # Standardize the features
    figsize=(12, 7)
)

# Now you can further customize the plot using the ax object
ax.set_facecolor('#f8f9fa')  # Light gray background
ax.set_title('K-means Clustering Analysis for Synthetic Data', fontsize=16, fontweight='bold')

# Display the generated plot
plt.show()

# Print the actual optimal number of clusters (which should be 4 in this case)
print("\nInertia values:")
for k, inertia in zip(cluster_sizes, inertia_values):
    print(f"k={k}: {inertia:.2f}")

print("\nSilhouette scores:")
for k, silhouette in zip(cluster_sizes, silhouette_values):
    if k > 1:  # Silhouette score not defined for k=1
        print(f"k={k}: {silhouette:.4f}")

# Create a scatter plot of the data with the optimal cluster assignment (k=4)
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

# Fit KMeans with k=4
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
labels = kmeans.fit_predict(X_scaled)

# Create a scatter plot with cluster assignments using fig, ax
# (points are drawn at their original, unscaled coordinates)
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50, alpha=0.8)
ax.set_title('K-means Clustering Result (k=4)', fontsize=15)
ax.set_xlabel('Feature 1', fontsize=12)
ax.set_ylabel('Feature 2', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.7)

fig.tight_layout()
plt.show()

# Verify the implementation by comparing with manually calculated metrics
# For k=4, calculate inertia manually: sum of squared distances from each
# scaled point to the centroid of its assigned cluster
manual_inertia = 0
for i, point in enumerate(X_scaled):
    centroid = kmeans.cluster_centers_[labels[i]]
    manual_inertia += np.sum((point - centroid) ** 2)

print(f"\nVerification for k=4:")
print(f"KMeans inertia: {kmeans.inertia_:.4f}")
print(f"Manually calculated inertia: {manual_inertia:.4f}")
Dataset shape: (400, 2)
feature1 feature2
0 -9.862671 8.727358
1 -4.604994 9.671808
2 -9.034922 7.105344
3 5.419975 1.855524
4 5.096591 2.881622
Standardizing features.
Inertia values:
k=1: 800.00
k=2: 417.20
k=3: 89.69
k=4: 15.00
k=5: 13.43
k=6: 11.91
k=7: 10.48
k=8: 9.25
k=9: 8.48
k=10: 7.65
Silhouette scores:
k=2: 0.5702
k=3: 0.7638
k=4: 0.8403
k=5: 0.7040
k=6: 0.5770
k=7: 0.4511
k=8: 0.3408
k=9: 0.3458
k=10: 0.3538
Verification for k=4:
KMeans inertia: 14.9956
Manually calculated inertia: 14.9956