use cluster affinities
This commit is contained in:
@@ -155,7 +155,7 @@ async fn main() -> Result<()> {
|
||||
let mut items_per_cluster = vec![Vec::new(); args.item_clusters as usize];
|
||||
let items_per_cluster_count = args.num_items / args.item_clusters;
|
||||
|
||||
// Drop existing item_clusters table and recreate with 3D centers
|
||||
// Drop existing item_clusters table and recreate with item affinity vectors
|
||||
client
|
||||
.execute("DROP TABLE IF EXISTS item_clusters", &[])
|
||||
.await?;
|
||||
@@ -164,9 +164,7 @@ async fn main() -> Result<()> {
|
||||
"CREATE TABLE item_clusters (
|
||||
item_id INTEGER PRIMARY KEY,
|
||||
cluster_id INTEGER,
|
||||
center_x FLOAT,
|
||||
center_y FLOAT,
|
||||
center_z FLOAT
|
||||
cluster_affinities FLOAT[] -- Array of affinities to archetypal items
|
||||
)",
|
||||
&[],
|
||||
)
|
||||
@@ -177,18 +175,53 @@ async fn main() -> Result<()> {
|
||||
.execute(&format!("TRUNCATE TABLE {}", args.interactions_table), &[])
|
||||
.await?;
|
||||
|
||||
// Create cluster affinity vectors based on item co-occurrence patterns
|
||||
let mut cluster_affinities =
|
||||
vec![vec![0.0; args.num_items as usize]; args.item_clusters as usize];
|
||||
|
||||
// For each cluster, select a set of archetypal items that define the cluster
|
||||
for cluster_id in 0..args.item_clusters {
|
||||
let start_idx = (cluster_id * args.num_items as i32 / args.item_clusters) as usize;
|
||||
let end_idx = ((cluster_id + 1) * args.num_items as i32 / args.item_clusters) as usize;
|
||||
|
||||
// Set high affinity for items in this cluster's range
|
||||
for i in start_idx..end_idx {
|
||||
cluster_affinities[cluster_id as usize][i] = 1.0;
|
||||
}
|
||||
|
||||
// Add some lower affinity to neighboring clusters' items
|
||||
let noise = args.noise_level;
|
||||
if cluster_id > 0 {
|
||||
let prev_start =
|
||||
((cluster_id - 1) * args.num_items as i32 / args.item_clusters) as usize;
|
||||
let prev_end = (cluster_id * args.num_items as i32 / args.item_clusters) as usize;
|
||||
for i in prev_start..prev_end {
|
||||
cluster_affinities[cluster_id as usize][i] = noise;
|
||||
}
|
||||
}
|
||||
if cluster_id < args.item_clusters - 1 {
|
||||
let next_start =
|
||||
((cluster_id + 1) * args.num_items as i32 / args.item_clusters) as usize;
|
||||
let next_end = ((cluster_id + 2) * args.num_items as i32 / args.item_clusters) as usize;
|
||||
for i in next_start..next_end {
|
||||
cluster_affinities[cluster_id as usize][i] = noise;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Assign items to clusters and store their affinity vectors
|
||||
for (i, &item_id) in all_items.iter().enumerate() {
|
||||
let cluster_id = (i as i32) / items_per_cluster_count;
|
||||
if cluster_id < args.item_clusters {
|
||||
items_per_cluster[cluster_id as usize].push(item_id);
|
||||
|
||||
// Store cluster assignment with 3D center
|
||||
let center = &cluster_centers[cluster_id as usize];
|
||||
// Store cluster assignment with affinity vector
|
||||
let affinities: Vec<f64> = cluster_affinities[cluster_id as usize].clone();
|
||||
client
|
||||
.execute(
|
||||
"INSERT INTO item_clusters (item_id, cluster_id, center_x, center_y, center_z)
|
||||
VALUES ($1, $2, $3, $4, $5)",
|
||||
&[&item_id, &cluster_id, &center[0], &center[1], &center[2]],
|
||||
"INSERT INTO item_clusters (item_id, cluster_id, cluster_affinities)
|
||||
VALUES ($1, $2, $3)",
|
||||
&[&item_id, &cluster_id, &affinities],
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
@@ -41,45 +41,43 @@ async fn create_pool() -> Result<Pool> {
|
||||
async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args) -> Result<()> {
|
||||
info!("Analyzing cluster cohesion...");
|
||||
|
||||
// Analyze cluster cohesion
|
||||
info!("Analyzing cluster cohesion...");
|
||||
// Calculate cosine similarity between affinity vectors
|
||||
let cohesion_stats = client
|
||||
.query_one(
|
||||
"WITH embedding_distances AS (
|
||||
"WITH affinity_similarities AS (
|
||||
SELECT
|
||||
a.item_id as item1,
|
||||
b.item_id as item2,
|
||||
a.cluster_id as cluster1,
|
||||
b.cluster_id as cluster2,
|
||||
SQRT(
|
||||
POW(a.center_x - b.center_x, 2) +
|
||||
POW(a.center_y - b.center_y, 2) +
|
||||
POW(a.center_z - b.center_z, 2)
|
||||
) as distance,
|
||||
CASE WHEN a.cluster_id = b.cluster_id THEN 'within' ELSE 'between' END as distance_type
|
||||
-- Compute cosine similarity between affinity vectors
|
||||
SUM(a1 * b1) / (SQRT(SUM(a1 * a1)) * SQRT(SUM(b1 * b1))) as similarity,
|
||||
CASE WHEN a.cluster_id = b.cluster_id THEN 'within' ELSE 'between' END as similarity_type
|
||||
FROM item_clusters a
|
||||
CROSS JOIN item_clusters b
|
||||
CROSS JOIN UNNEST(a.cluster_affinities, b.cluster_affinities) AS t(a1, b1)
|
||||
WHERE a.item_id < b.item_id
|
||||
GROUP BY a.item_id, b.item_id, a.cluster_id, b.cluster_id
|
||||
)
|
||||
SELECT
|
||||
AVG(CASE WHEN distance_type = 'within' THEN distance END) as avg_within,
|
||||
STDDEV(CASE WHEN distance_type = 'within' THEN distance END) as stddev_within,
|
||||
MIN(CASE WHEN distance_type = 'within' THEN distance END) as min_within,
|
||||
MAX(CASE WHEN distance_type = 'within' THEN distance END) as max_within,
|
||||
COUNT(CASE WHEN distance_type = 'within' THEN 1 END) as count_within,
|
||||
AVG(CASE WHEN distance_type = 'between' THEN distance END) as avg_between,
|
||||
STDDEV(CASE WHEN distance_type = 'between' THEN distance END) as stddev_between,
|
||||
MIN(CASE WHEN distance_type = 'between' THEN distance END) as min_between,
|
||||
MAX(CASE WHEN distance_type = 'between' THEN distance END) as max_between,
|
||||
COUNT(CASE WHEN distance_type = 'between' THEN 1 END) as count_between
|
||||
FROM embedding_distances",
|
||||
AVG(CASE WHEN similarity_type = 'within' THEN similarity END) as avg_within,
|
||||
STDDEV(CASE WHEN similarity_type = 'within' THEN similarity END) as stddev_within,
|
||||
MIN(CASE WHEN similarity_type = 'within' THEN similarity END) as min_within,
|
||||
MAX(CASE WHEN similarity_type = 'within' THEN similarity END) as max_within,
|
||||
COUNT(CASE WHEN similarity_type = 'within' THEN 1 END) as count_within,
|
||||
AVG(CASE WHEN similarity_type = 'between' THEN similarity END) as avg_between,
|
||||
STDDEV(CASE WHEN similarity_type = 'between' THEN similarity END) as stddev_between,
|
||||
MIN(CASE WHEN similarity_type = 'between' THEN similarity END) as min_between,
|
||||
MAX(CASE WHEN similarity_type = 'between' THEN similarity END) as max_between,
|
||||
COUNT(CASE WHEN similarity_type = 'between' THEN 1 END) as count_between
|
||||
FROM affinity_similarities",
|
||||
&[],
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Print cohesion statistics
|
||||
info!(
|
||||
"Within Cluster: avg={:.3}, stddev={:.3}, min={:.3}, max={:.3}, pairs={}",
|
||||
"Within Cluster Similarity: avg={:.3}, stddev={:.3}, min={:.3}, max={:.3}, pairs={}",
|
||||
cohesion_stats.get::<_, f64>("avg_within"),
|
||||
cohesion_stats.get::<_, f64>("stddev_within"),
|
||||
cohesion_stats.get::<_, f64>("min_within"),
|
||||
@@ -88,7 +86,7 @@ async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args)
|
||||
);
|
||||
|
||||
info!(
|
||||
"Between Clusters: avg={:.3}, stddev={:.3}, min={:.3}, max={:.3}, pairs={}",
|
||||
"Between Clusters Similarity: avg={:.3}, stddev={:.3}, min={:.3}, max={:.3}, pairs={}",
|
||||
cohesion_stats.get::<_, f64>("avg_between"),
|
||||
cohesion_stats.get::<_, f64>("stddev_between"),
|
||||
cohesion_stats.get::<_, f64>("min_between"),
|
||||
@@ -100,23 +98,21 @@ async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args)
|
||||
info!("\nPer-cluster cohesion:");
|
||||
let cluster_stats = client
|
||||
.query(
|
||||
"WITH cluster_distances AS (
|
||||
"WITH cluster_similarities AS (
|
||||
SELECT
|
||||
a.cluster_id,
|
||||
SQRT(
|
||||
POW(a.center_x - b.center_x, 2) +
|
||||
POW(a.center_y - b.center_y, 2) +
|
||||
POW(a.center_z - b.center_z, 2)
|
||||
) as distance
|
||||
SUM(a1 * b1) / (SQRT(SUM(a1 * a1)) * SQRT(SUM(b1 * b1))) as similarity
|
||||
FROM item_clusters a
|
||||
JOIN item_clusters b ON a.cluster_id = b.cluster_id AND a.item_id < b.item_id
|
||||
CROSS JOIN UNNEST(a.cluster_affinities, b.cluster_affinities) AS t(a1, b1)
|
||||
GROUP BY a.cluster_id, a.item_id, b.item_id
|
||||
)
|
||||
SELECT
|
||||
cluster_id,
|
||||
AVG(distance) as avg_distance,
|
||||
STDDEV(distance) as stddev_distance,
|
||||
AVG(similarity) as avg_similarity,
|
||||
STDDEV(similarity) as stddev_similarity,
|
||||
COUNT(*) as num_pairs
|
||||
FROM cluster_distances
|
||||
FROM cluster_similarities
|
||||
GROUP BY cluster_id
|
||||
ORDER BY cluster_id",
|
||||
&[],
|
||||
@@ -125,40 +121,36 @@ async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args)
|
||||
|
||||
for row in cluster_stats {
|
||||
let cluster_id: i32 = row.get("cluster_id");
|
||||
let avg_distance: f64 = row.get("avg_distance");
|
||||
let stddev_distance: f64 = row.get("stddev_distance");
|
||||
let avg_similarity: f64 = row.get("avg_similarity");
|
||||
let stddev_similarity: f64 = row.get("stddev_similarity");
|
||||
let num_pairs: i64 = row.get("num_pairs");
|
||||
info!(
|
||||
"Cluster {}: avg={:.3}, stddev={:.3}, pairs={}",
|
||||
cluster_id, avg_distance, stddev_distance, num_pairs
|
||||
"Cluster {}: avg_similarity={:.3}, stddev={:.3}, pairs={}",
|
||||
cluster_id, avg_similarity, stddev_similarity, num_pairs
|
||||
);
|
||||
}
|
||||
|
||||
// Calculate separation between specific cluster pairs
|
||||
let query = format!(
|
||||
"WITH distances AS (
|
||||
"WITH similarities AS (
|
||||
SELECT
|
||||
a.cluster_id as cluster1,
|
||||
b.cluster_id as cluster2,
|
||||
SQRT(SUM((e1 - e2) * (e1 - e2)))::float8 as distance
|
||||
FROM {} a
|
||||
JOIN {} b ON a.item_id < b.item_id
|
||||
JOIN {} ie1 ON a.item_id = ie1.item_id
|
||||
JOIN {} ie2 ON b.item_id = ie2.item_id,
|
||||
UNNEST(ie1.embedding, ie2.embedding) AS t(e1, e2)
|
||||
WHERE a.cluster_id < b.cluster_id
|
||||
GROUP BY a.item_id, b.item_id, a.cluster_id, b.cluster_id
|
||||
SUM(a1 * b1) / (SQRT(SUM(a1 * a1)) * SQRT(SUM(b1 * b1))) as similarity
|
||||
FROM item_clusters a
|
||||
JOIN item_clusters b ON a.cluster_id < b.cluster_id
|
||||
CROSS JOIN UNNEST(a.cluster_affinities, b.cluster_affinities) AS t(a1, b1)
|
||||
GROUP BY a.cluster_id, b.cluster_id, a.item_id, b.item_id
|
||||
)
|
||||
SELECT
|
||||
cluster1,
|
||||
cluster2,
|
||||
AVG(distance)::float8 as avg_distance,
|
||||
STDDEV(distance)::float8 as stddev_distance,
|
||||
AVG(similarity) as avg_similarity,
|
||||
STDDEV(similarity) as stddev_similarity,
|
||||
COUNT(*) as num_pairs
|
||||
FROM distances
|
||||
FROM similarities
|
||||
GROUP BY cluster1, cluster2
|
||||
ORDER BY cluster1, cluster2",
|
||||
args.clusters_table, args.clusters_table, args.embeddings_table, args.embeddings_table
|
||||
);
|
||||
|
||||
info!("\nBetween-cluster separation:");
|
||||
@@ -171,7 +163,7 @@ async fn analyze_cluster_cohesion(client: &tokio_postgres::Client, args: &Args)
|
||||
let count: i64 = row.get(4);
|
||||
|
||||
info!(
|
||||
"Clusters {} <-> {}: avg_distance={:.3}±{:.3} ({} pairs)",
|
||||
"Clusters {} <-> {}: avg_similarity={:.3}±{:.3} ({} pairs)",
|
||||
cluster1, cluster2, avg, stddev, count
|
||||
);
|
||||
}
|
||||
@@ -231,32 +223,29 @@ async fn analyze_embedding_stats(client: &tokio_postgres::Client, args: &Args) -
|
||||
}
|
||||
|
||||
async fn analyze_cluster_correlation(client: &tokio_postgres::Client, args: &Args) -> Result<()> {
|
||||
info!("Analyzing correlation between cluster centers and embedding distances...");
|
||||
info!("Analyzing correlation between cluster affinities and embedding similarities...");
|
||||
|
||||
// Calculate correlation between cluster center distances and embedding distances
|
||||
// Calculate correlation between affinity similarities and embedding similarities
|
||||
let correlation = client
|
||||
.query_one(
|
||||
"WITH distances AS (
|
||||
SELECT
|
||||
a.cluster_id as cluster1,
|
||||
b.cluster_id as cluster2,
|
||||
SQRT(
|
||||
POW(a.center_x - b.center_x, 2) +
|
||||
POW(a.center_y - b.center_y, 2) +
|
||||
POW(a.center_z - b.center_z, 2)
|
||||
) as center_distance,
|
||||
SQRT(
|
||||
POW(ae.embedding[1] - be.embedding[1], 2) +
|
||||
POW(ae.embedding[2] - be.embedding[2], 2) +
|
||||
POW(ae.embedding[3] - be.embedding[3], 2)
|
||||
) as embedding_distance
|
||||
-- Compute affinity similarity
|
||||
SUM(a1 * b1) / (SQRT(SUM(a1 * a1)) * SQRT(SUM(b1 * b1))) as affinity_similarity,
|
||||
-- Compute embedding similarity
|
||||
SUM(e1 * e2) / (SQRT(SUM(e1 * e1)) * SQRT(SUM(e2 * e2))) as embedding_similarity
|
||||
FROM item_clusters a
|
||||
JOIN item_clusters b ON a.cluster_id < b.cluster_id
|
||||
JOIN item_embeddings ae ON a.item_id = ae.item_id
|
||||
JOIN item_embeddings be ON b.item_id = be.item_id
|
||||
CROSS JOIN UNNEST(a.cluster_affinities, b.cluster_affinities) AS t1(a1, b1)
|
||||
CROSS JOIN UNNEST(ae.embedding, be.embedding) AS t2(e1, e2)
|
||||
GROUP BY a.cluster_id, b.cluster_id, a.item_id, b.item_id
|
||||
)
|
||||
SELECT
|
||||
corr(center_distance, embedding_distance) as correlation,
|
||||
corr(affinity_similarity, embedding_similarity) as correlation,
|
||||
COUNT(*) as num_pairs
|
||||
FROM distances",
|
||||
&[],
|
||||
@@ -266,7 +255,7 @@ async fn analyze_cluster_correlation(client: &tokio_postgres::Client, args: &Arg
|
||||
let correlation_value: f64 = correlation.get("correlation");
|
||||
let num_pairs: i64 = correlation.get("num_pairs");
|
||||
info!(
|
||||
"Correlation between cluster center distances and embedding distances: {:.3} ({} pairs)",
|
||||
"Correlation between affinity similarities and embedding similarities: {:.3} ({} pairs)",
|
||||
correlation_value, num_pairs
|
||||
);
|
||||
|
||||
|
||||
@@ -72,7 +72,7 @@ async fn main() -> Result<()> {
|
||||
// Load embeddings and cluster information
|
||||
info!("Loading embeddings and cluster information...");
|
||||
let query = format!(
|
||||
"SELECT e.item_id, e.embedding, c.cluster_id
|
||||
"SELECT e.item_id, e.embedding, c.cluster_id, c.cluster_affinities
|
||||
FROM {} e
|
||||
JOIN {} c ON e.item_id = c.item_id
|
||||
ORDER BY e.item_id",
|
||||
@@ -92,22 +92,34 @@ async fn main() -> Result<()> {
|
||||
let mut data = Array2::zeros((n_items, n_dims));
|
||||
let mut item_ids = Vec::with_capacity(n_items);
|
||||
let mut cluster_ids = Vec::with_capacity(n_items);
|
||||
let mut affinity_data = Array2::zeros((n_items, n_dims)); // Changed from n_items to n_dims for affinity dimension
|
||||
|
||||
for (i, row) in rows.iter().enumerate() {
|
||||
let item_id: i32 = row.get(0);
|
||||
let embedding: Vec<f64> = row.get(1);
|
||||
let cluster_id: i32 = row.get(2);
|
||||
let affinities: Vec<f64> = row.get(3);
|
||||
|
||||
item_ids.push(item_id);
|
||||
cluster_ids.push(cluster_id);
|
||||
data.row_mut(i).assign(&ArrayView1::from(&embedding));
|
||||
|
||||
// Ensure affinity vector has the right length by truncating or padding if necessary
|
||||
let mut affinity_vec = vec![0.0; n_dims];
|
||||
for (j, &val) in affinities.iter().take(n_dims).enumerate() {
|
||||
affinity_vec[j] = val;
|
||||
}
|
||||
affinity_data
|
||||
.row_mut(i)
|
||||
.assign(&ArrayView1::from(&affinity_vec));
|
||||
}
|
||||
|
||||
// Perform PCA with 3 components
|
||||
// Perform PCA on both embeddings and affinity vectors
|
||||
info!("Performing PCA...");
|
||||
let projected_data = perform_pca(&data, 3)?;
|
||||
let projected_embeddings = perform_pca(&data, 3)?;
|
||||
let projected_affinities = perform_pca(&affinity_data, 3)?;
|
||||
|
||||
// Create scatter plot for each cluster
|
||||
// Create scatter plot for each cluster using embeddings
|
||||
let mut plot = Plot::new();
|
||||
let unique_clusters: Vec<_> = cluster_ids
|
||||
.iter()
|
||||
@@ -116,24 +128,34 @@ async fn main() -> Result<()> {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
for cluster_id in unique_clusters {
|
||||
// Plot embeddings
|
||||
for cluster_id in &unique_clusters {
|
||||
let indices: Vec<_> = cluster_ids
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &c)| c == cluster_id)
|
||||
.filter(|(_, &c)| c == *cluster_id)
|
||||
.map(|(i, _)| i)
|
||||
.collect();
|
||||
|
||||
let x: Vec<_> = indices.iter().map(|&i| projected_data[[i, 0]]).collect();
|
||||
let y: Vec<_> = indices.iter().map(|&i| projected_data[[i, 1]]).collect();
|
||||
let z: Vec<_> = indices.iter().map(|&i| projected_data[[i, 2]]).collect();
|
||||
let x: Vec<_> = indices
|
||||
.iter()
|
||||
.map(|&i| projected_embeddings[[i, 0]])
|
||||
.collect();
|
||||
let y: Vec<_> = indices
|
||||
.iter()
|
||||
.map(|&i| projected_embeddings[[i, 1]])
|
||||
.collect();
|
||||
let z: Vec<_> = indices
|
||||
.iter()
|
||||
.map(|&i| projected_embeddings[[i, 2]])
|
||||
.collect();
|
||||
let text: Vec<_> = indices
|
||||
.iter()
|
||||
.map(|&i| format!("Item {}", item_ids[i]))
|
||||
.collect();
|
||||
|
||||
let trace = Scatter3D::new(x, y, z)
|
||||
.name(&format!("Cluster {}", cluster_id))
|
||||
.name(&format!("Cluster {} (Embeddings)", cluster_id))
|
||||
.mode(Mode::Markers)
|
||||
.text_array(text)
|
||||
.marker(
|
||||
@@ -146,9 +168,51 @@ async fn main() -> Result<()> {
|
||||
plot.add_trace(trace);
|
||||
}
|
||||
|
||||
// Plot affinity vectors
|
||||
for cluster_id in &unique_clusters {
|
||||
let indices: Vec<_> = cluster_ids
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &c)| c == *cluster_id)
|
||||
.map(|(i, _)| i)
|
||||
.collect();
|
||||
|
||||
let x: Vec<_> = indices
|
||||
.iter()
|
||||
.map(|&i| projected_affinities[[i, 0]])
|
||||
.collect();
|
||||
let y: Vec<_> = indices
|
||||
.iter()
|
||||
.map(|&i| projected_affinities[[i, 1]])
|
||||
.collect();
|
||||
let z: Vec<_> = indices
|
||||
.iter()
|
||||
.map(|&i| projected_affinities[[i, 2]])
|
||||
.collect();
|
||||
let text: Vec<_> = indices
|
||||
.iter()
|
||||
.map(|&i| format!("Item {}", item_ids[i]))
|
||||
.collect();
|
||||
|
||||
let trace = Scatter3D::new(x, y, z)
|
||||
.name(&format!("Cluster {} (Affinities)", cluster_id))
|
||||
.mode(Mode::Markers)
|
||||
.text_array(text)
|
||||
.marker(
|
||||
plotly::common::Marker::new()
|
||||
.size(8)
|
||||
.symbol(plotly::common::MarkerSymbol::Square),
|
||||
)
|
||||
.show_legend(true);
|
||||
|
||||
plot.add_trace(trace);
|
||||
}
|
||||
|
||||
plot.set_layout(
|
||||
Layout::new()
|
||||
.title(Title::new("Item Embeddings Visualization (PCA)"))
|
||||
.title(Title::new(
|
||||
"Item Embeddings and Affinities Visualization (PCA)",
|
||||
))
|
||||
.show_legend(true)
|
||||
.legend(Legend::new().x(1.0).y(0.5))
|
||||
.margin(Margin::new().left(100).right(100).top(100).bottom(100))
|
||||
|
||||
Reference in New Issue
Block a user