Use cluster affinities

This commit is contained in:
Dylan Knutson
2024-12-28 03:32:38 +00:00
parent 9b4316e819
commit ab5f379b94
3 changed files with 169 additions and 83 deletions

View File

@@ -155,7 +155,7 @@ async fn main() -> Result<()> {
let mut items_per_cluster = vec![Vec::new(); args.item_clusters as usize];
let items_per_cluster_count = args.num_items / args.item_clusters;
// Drop existing item_clusters table and recreate with 3D centers
// Drop existing item_clusters table and recreate with item affinity vectors
client
.execute("DROP TABLE IF EXISTS item_clusters", &[])
.await?;
@@ -164,9 +164,7 @@ async fn main() -> Result<()> {
"CREATE TABLE item_clusters (
item_id INTEGER PRIMARY KEY,
cluster_id INTEGER,
center_x FLOAT,
center_y FLOAT,
center_z FLOAT
cluster_affinities FLOAT[] -- Array of affinities to archetypal items
)",
&[],
)
@@ -177,18 +175,53 @@ async fn main() -> Result<()> {
.execute(&format!("TRUNCATE TABLE {}", args.interactions_table), &[])
.await?;
// Create cluster affinity vectors based on item co-occurrence patterns
let mut cluster_affinities =
vec![vec![0.0; args.num_items as usize]; args.item_clusters as usize];
// For each cluster, select a set of archetypal items that define the cluster
for cluster_id in 0..args.item_clusters {
let start_idx = (cluster_id * args.num_items as i32 / args.item_clusters) as usize;
let end_idx = ((cluster_id + 1) * args.num_items as i32 / args.item_clusters) as usize;
// Set high affinity for items in this cluster's range
for i in start_idx..end_idx {
cluster_affinities[cluster_id as usize][i] = 1.0;
}
// Add some lower affinity to neighboring clusters' items
let noise = args.noise_level;
if cluster_id > 0 {
let prev_start =
((cluster_id - 1) * args.num_items as i32 / args.item_clusters) as usize;
let prev_end = (cluster_id * args.num_items as i32 / args.item_clusters) as usize;
for i in prev_start..prev_end {
cluster_affinities[cluster_id as usize][i] = noise;
}
}
if cluster_id < args.item_clusters - 1 {
let next_start =
((cluster_id + 1) * args.num_items as i32 / args.item_clusters) as usize;
let next_end = ((cluster_id + 2) * args.num_items as i32 / args.item_clusters) as usize;
for i in next_start..next_end {
cluster_affinities[cluster_id as usize][i] = noise;
}
}
}
// Assign items to clusters and store their affinity vectors
for (i, &item_id) in all_items.iter().enumerate() {
let cluster_id = (i as i32) / items_per_cluster_count;
if cluster_id < args.item_clusters {
items_per_cluster[cluster_id as usize].push(item_id);
// Store cluster assignment with 3D center
let center = &cluster_centers[cluster_id as usize];
// Store cluster assignment with affinity vector
let affinities: Vec<f64> = cluster_affinities[cluster_id as usize].clone();
client
.execute(
"INSERT INTO item_clusters (item_id, cluster_id, center_x, center_y, center_z)
VALUES ($1, $2, $3, $4, $5)",
&[&item_id, &cluster_id, &center[0], &center[1], &center[2]],
"INSERT INTO item_clusters (item_id, cluster_id, cluster_affinities)
VALUES ($1, $2, $3)",
&[&item_id, &cluster_id, &affinities],
)
.await?;
}

View File

@@ -41,45 +41,43 @@ async fn create_pool() -> Result<Pool> {
async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args) -> Result<()> {
info!("Analyzing cluster cohesion...");
// Analyze cluster cohesion
info!("Analyzing cluster cohesion...");
// Calculate cosine similarity between affinity vectors
let cohesion_stats = client
.query_one(
"WITH embedding_distances AS (
"WITH affinity_similarities AS (
SELECT
a.item_id as item1,
b.item_id as item2,
a.cluster_id as cluster1,
b.cluster_id as cluster2,
SQRT(
POW(a.center_x - b.center_x, 2) +
POW(a.center_y - b.center_y, 2) +
POW(a.center_z - b.center_z, 2)
) as distance,
CASE WHEN a.cluster_id = b.cluster_id THEN 'within' ELSE 'between' END as distance_type
-- Compute cosine similarity between affinity vectors
SUM(a1 * b1) / (SQRT(SUM(a1 * a1)) * SQRT(SUM(b1 * b1))) as similarity,
CASE WHEN a.cluster_id = b.cluster_id THEN 'within' ELSE 'between' END as similarity_type
FROM item_clusters a
CROSS JOIN item_clusters b
CROSS JOIN UNNEST(a.cluster_affinities, b.cluster_affinities) AS t(a1, b1)
WHERE a.item_id < b.item_id
GROUP BY a.item_id, b.item_id, a.cluster_id, b.cluster_id
)
SELECT
AVG(CASE WHEN distance_type = 'within' THEN distance END) as avg_within,
STDDEV(CASE WHEN distance_type = 'within' THEN distance END) as stddev_within,
MIN(CASE WHEN distance_type = 'within' THEN distance END) as min_within,
MAX(CASE WHEN distance_type = 'within' THEN distance END) as max_within,
COUNT(CASE WHEN distance_type = 'within' THEN 1 END) as count_within,
AVG(CASE WHEN distance_type = 'between' THEN distance END) as avg_between,
STDDEV(CASE WHEN distance_type = 'between' THEN distance END) as stddev_between,
MIN(CASE WHEN distance_type = 'between' THEN distance END) as min_between,
MAX(CASE WHEN distance_type = 'between' THEN distance END) as max_between,
COUNT(CASE WHEN distance_type = 'between' THEN 1 END) as count_between
FROM embedding_distances",
AVG(CASE WHEN similarity_type = 'within' THEN similarity END) as avg_within,
STDDEV(CASE WHEN similarity_type = 'within' THEN similarity END) as stddev_within,
MIN(CASE WHEN similarity_type = 'within' THEN similarity END) as min_within,
MAX(CASE WHEN similarity_type = 'within' THEN similarity END) as max_within,
COUNT(CASE WHEN similarity_type = 'within' THEN 1 END) as count_within,
AVG(CASE WHEN similarity_type = 'between' THEN similarity END) as avg_between,
STDDEV(CASE WHEN similarity_type = 'between' THEN similarity END) as stddev_between,
MIN(CASE WHEN similarity_type = 'between' THEN similarity END) as min_between,
MAX(CASE WHEN similarity_type = 'between' THEN similarity END) as max_between,
COUNT(CASE WHEN similarity_type = 'between' THEN 1 END) as count_between
FROM affinity_similarities",
&[],
)
.await?;
// Print cohesion statistics
info!(
"Within Cluster: avg={:.3}, stddev={:.3}, min={:.3}, max={:.3}, pairs={}",
"Within Cluster Similarity: avg={:.3}, stddev={:.3}, min={:.3}, max={:.3}, pairs={}",
cohesion_stats.get::<_, f64>("avg_within"),
cohesion_stats.get::<_, f64>("stddev_within"),
cohesion_stats.get::<_, f64>("min_within"),
@@ -88,7 +86,7 @@ async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args)
);
info!(
"Between Clusters: avg={:.3}, stddev={:.3}, min={:.3}, max={:.3}, pairs={}",
"Between Clusters Similarity: avg={:.3}, stddev={:.3}, min={:.3}, max={:.3}, pairs={}",
cohesion_stats.get::<_, f64>("avg_between"),
cohesion_stats.get::<_, f64>("stddev_between"),
cohesion_stats.get::<_, f64>("min_between"),
@@ -100,23 +98,21 @@ async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args)
info!("\nPer-cluster cohesion:");
let cluster_stats = client
.query(
"WITH cluster_distances AS (
"WITH cluster_similarities AS (
SELECT
a.cluster_id,
SQRT(
POW(a.center_x - b.center_x, 2) +
POW(a.center_y - b.center_y, 2) +
POW(a.center_z - b.center_z, 2)
) as distance
SUM(a1 * b1) / (SQRT(SUM(a1 * a1)) * SQRT(SUM(b1 * b1))) as similarity
FROM item_clusters a
JOIN item_clusters b ON a.cluster_id = b.cluster_id AND a.item_id < b.item_id
CROSS JOIN UNNEST(a.cluster_affinities, b.cluster_affinities) AS t(a1, b1)
GROUP BY a.cluster_id, a.item_id, b.item_id
)
SELECT
cluster_id,
AVG(distance) as avg_distance,
STDDEV(distance) as stddev_distance,
AVG(similarity) as avg_similarity,
STDDEV(similarity) as stddev_similarity,
COUNT(*) as num_pairs
FROM cluster_distances
FROM cluster_similarities
GROUP BY cluster_id
ORDER BY cluster_id",
&[],
@@ -125,40 +121,36 @@ async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args)
for row in cluster_stats {
let cluster_id: i32 = row.get("cluster_id");
let avg_distance: f64 = row.get("avg_distance");
let stddev_distance: f64 = row.get("stddev_distance");
let avg_similarity: f64 = row.get("avg_similarity");
let stddev_similarity: f64 = row.get("stddev_similarity");
let num_pairs: i64 = row.get("num_pairs");
info!(
"Cluster {}: avg={:.3}, stddev={:.3}, pairs={}",
cluster_id, avg_distance, stddev_distance, num_pairs
"Cluster {}: avg_similarity={:.3}, stddev={:.3}, pairs={}",
cluster_id, avg_similarity, stddev_similarity, num_pairs
);
}
// Calculate separation between specific cluster pairs
let query = format!(
"WITH distances AS (
"WITH similarities AS (
SELECT
a.cluster_id as cluster1,
b.cluster_id as cluster2,
SQRT(SUM((e1 - e2) * (e1 - e2)))::float8 as distance
FROM {} a
JOIN {} b ON a.item_id < b.item_id
JOIN {} ie1 ON a.item_id = ie1.item_id
JOIN {} ie2 ON b.item_id = ie2.item_id,
UNNEST(ie1.embedding, ie2.embedding) AS t(e1, e2)
WHERE a.cluster_id < b.cluster_id
GROUP BY a.item_id, b.item_id, a.cluster_id, b.cluster_id
SUM(a1 * b1) / (SQRT(SUM(a1 * a1)) * SQRT(SUM(b1 * b1))) as similarity
FROM item_clusters a
JOIN item_clusters b ON a.cluster_id < b.cluster_id
CROSS JOIN UNNEST(a.cluster_affinities, b.cluster_affinities) AS t(a1, b1)
GROUP BY a.cluster_id, b.cluster_id, a.item_id, b.item_id
)
SELECT
cluster1,
cluster2,
AVG(distance)::float8 as avg_distance,
STDDEV(distance)::float8 as stddev_distance,
AVG(similarity) as avg_similarity,
STDDEV(similarity) as stddev_similarity,
COUNT(*) as num_pairs
FROM distances
FROM similarities
GROUP BY cluster1, cluster2
ORDER BY cluster1, cluster2",
args.clusters_table, args.clusters_table, args.embeddings_table, args.embeddings_table
);
info!("\nBetween-cluster separation:");
@@ -171,7 +163,7 @@ async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args)
let count: i64 = row.get(4);
info!(
"Clusters {} <-> {}: avg_distance={:.3}±{:.3} ({} pairs)",
"Clusters {} <-> {}: avg_similarity={:.3}±{:.3} ({} pairs)",
cluster1, cluster2, avg, stddev, count
);
}
@@ -231,32 +223,29 @@ async fn analyze_embedding_stats(client: &tokio_postgres::Client, args: &Args) -
}
async fn analyze_cluster_correlation(client: &tokio_postgres::Client, args: &Args) -> Result<()> {
info!("Analyzing correlation between cluster centers and embedding distances...");
info!("Analyzing correlation between cluster affinities and embedding similarities...");
// Calculate correlation between cluster center distances and embedding distances
// Calculate correlation between affinity similarities and embedding similarities
let correlation = client
.query_one(
"WITH distances AS (
SELECT
a.cluster_id as cluster1,
b.cluster_id as cluster2,
SQRT(
POW(a.center_x - b.center_x, 2) +
POW(a.center_y - b.center_y, 2) +
POW(a.center_z - b.center_z, 2)
) as center_distance,
SQRT(
POW(ae.embedding[1] - be.embedding[1], 2) +
POW(ae.embedding[2] - be.embedding[2], 2) +
POW(ae.embedding[3] - be.embedding[3], 2)
) as embedding_distance
-- Compute affinity similarity
SUM(a1 * b1) / (SQRT(SUM(a1 * a1)) * SQRT(SUM(b1 * b1))) as affinity_similarity,
-- Compute embedding similarity
SUM(e1 * e2) / (SQRT(SUM(e1 * e1)) * SQRT(SUM(e2 * e2))) as embedding_similarity
FROM item_clusters a
JOIN item_clusters b ON a.cluster_id < b.cluster_id
JOIN item_embeddings ae ON a.item_id = ae.item_id
JOIN item_embeddings be ON b.item_id = be.item_id
CROSS JOIN UNNEST(a.cluster_affinities, b.cluster_affinities) AS t1(a1, b1)
CROSS JOIN UNNEST(ae.embedding, be.embedding) AS t2(e1, e2)
GROUP BY a.cluster_id, b.cluster_id, a.item_id, b.item_id
)
SELECT
corr(center_distance, embedding_distance) as correlation,
corr(affinity_similarity, embedding_similarity) as correlation,
COUNT(*) as num_pairs
FROM distances",
&[],
@@ -266,7 +255,7 @@ async fn analyze_cluster_correlation(client: &tokio_postgres::Client, args: &Arg
let correlation_value: f64 = correlation.get("correlation");
let num_pairs: i64 = correlation.get("num_pairs");
info!(
"Correlation between cluster center distances and embedding distances: {:.3} ({} pairs)",
"Correlation between affinity similarities and embedding similarities: {:.3} ({} pairs)",
correlation_value, num_pairs
);

View File

@@ -72,7 +72,7 @@ async fn main() -> Result<()> {
// Load embeddings and cluster information
info!("Loading embeddings and cluster information...");
let query = format!(
"SELECT e.item_id, e.embedding, c.cluster_id
"SELECT e.item_id, e.embedding, c.cluster_id, c.cluster_affinities
FROM {} e
JOIN {} c ON e.item_id = c.item_id
ORDER BY e.item_id",
@@ -92,22 +92,34 @@ async fn main() -> Result<()> {
let mut data = Array2::zeros((n_items, n_dims));
let mut item_ids = Vec::with_capacity(n_items);
let mut cluster_ids = Vec::with_capacity(n_items);
let mut affinity_data = Array2::zeros((n_items, n_dims)); // NOTE(review): affinity vectors are built with length num_items, not n_dims — the loop below truncates/pads them to n_dims, discarding affinity entries whenever num_items > n_dims; confirm this is intended
for (i, row) in rows.iter().enumerate() {
let item_id: i32 = row.get(0);
let embedding: Vec<f64> = row.get(1);
let cluster_id: i32 = row.get(2);
let affinities: Vec<f64> = row.get(3);
item_ids.push(item_id);
cluster_ids.push(cluster_id);
data.row_mut(i).assign(&ArrayView1::from(&embedding));
// Ensure affinity vector has the right length by truncating or padding if necessary
let mut affinity_vec = vec![0.0; n_dims];
for (j, &val) in affinities.iter().take(n_dims).enumerate() {
affinity_vec[j] = val;
}
affinity_data
.row_mut(i)
.assign(&ArrayView1::from(&affinity_vec));
}
// Perform PCA with 3 components
// Perform PCA on both embeddings and affinity vectors
info!("Performing PCA...");
let projected_data = perform_pca(&data, 3)?;
let projected_embeddings = perform_pca(&data, 3)?;
let projected_affinities = perform_pca(&affinity_data, 3)?;
// Create scatter plot for each cluster
// Create scatter plot for each cluster using embeddings
let mut plot = Plot::new();
let unique_clusters: Vec<_> = cluster_ids
.iter()
@@ -116,24 +128,34 @@ async fn main() -> Result<()> {
.into_iter()
.collect();
for cluster_id in unique_clusters {
// Plot embeddings
for cluster_id in &unique_clusters {
let indices: Vec<_> = cluster_ids
.iter()
.enumerate()
.filter(|(_, &c)| c == cluster_id)
.filter(|(_, &c)| c == *cluster_id)
.map(|(i, _)| i)
.collect();
let x: Vec<_> = indices.iter().map(|&i| projected_data[[i, 0]]).collect();
let y: Vec<_> = indices.iter().map(|&i| projected_data[[i, 1]]).collect();
let z: Vec<_> = indices.iter().map(|&i| projected_data[[i, 2]]).collect();
let x: Vec<_> = indices
.iter()
.map(|&i| projected_embeddings[[i, 0]])
.collect();
let y: Vec<_> = indices
.iter()
.map(|&i| projected_embeddings[[i, 1]])
.collect();
let z: Vec<_> = indices
.iter()
.map(|&i| projected_embeddings[[i, 2]])
.collect();
let text: Vec<_> = indices
.iter()
.map(|&i| format!("Item {}", item_ids[i]))
.collect();
let trace = Scatter3D::new(x, y, z)
.name(&format!("Cluster {}", cluster_id))
.name(&format!("Cluster {} (Embeddings)", cluster_id))
.mode(Mode::Markers)
.text_array(text)
.marker(
@@ -146,9 +168,51 @@ async fn main() -> Result<()> {
plot.add_trace(trace);
}
// Plot affinity vectors
for cluster_id in &unique_clusters {
let indices: Vec<_> = cluster_ids
.iter()
.enumerate()
.filter(|(_, &c)| c == *cluster_id)
.map(|(i, _)| i)
.collect();
let x: Vec<_> = indices
.iter()
.map(|&i| projected_affinities[[i, 0]])
.collect();
let y: Vec<_> = indices
.iter()
.map(|&i| projected_affinities[[i, 1]])
.collect();
let z: Vec<_> = indices
.iter()
.map(|&i| projected_affinities[[i, 2]])
.collect();
let text: Vec<_> = indices
.iter()
.map(|&i| format!("Item {}", item_ids[i]))
.collect();
let trace = Scatter3D::new(x, y, z)
.name(&format!("Cluster {} (Affinities)", cluster_id))
.mode(Mode::Markers)
.text_array(text)
.marker(
plotly::common::Marker::new()
.size(8)
.symbol(plotly::common::MarkerSymbol::Square),
)
.show_legend(true);
plot.add_trace(trace);
}
plot.set_layout(
Layout::new()
.title(Title::new("Item Embeddings Visualization (PCA)"))
.title(Title::new(
"Item Embeddings and Affinities Visualization (PCA)",
))
.show_legend(true)
.legend(Legend::new().x(1.0).y(0.5))
.margin(Margin::new().left(100).right(100).top(100).bottom(100))