k-Means

Definition

Definition
Explanation
Guidance
Tips

The k-Means algorithm is an unsupervised machine learning technique used for clustering data into k groups based on similarities in their features. It partitions the data into clusters by minimizing the sum of squared distances between data points and their respective cluster centroids

Practice

Practice
Solution

kMeans(data, k):
    // Step 1: Initialization
    centroids = randomly_initialize_centroids(data, k)
    clusters = []
    repeat:
        // Step 2: Assignment
        clusters = assign_points_to_clusters(data, centroids)
        // Step 3: Update centroids
        centroids = update_centroids(clusters)
    until convergence
    return clusters

randomly_initialize_centroids(data, k):
    centroids = []
    randomly_select k points from data and add them to centroids
    return centroids

assign_points_to_clusters(data, centroids):
    clusters = initialize_empty_clusters(k)
    for each point in data:
        nearest_centroid = find_nearest_centroid(point, centroids)
        add point to nearest_centroid's cluster in clusters
    return clusters

update_centroids(clusters):
    centroids = []
    for each cluster in clusters:
        new_centroid = calculate_mean(cluster.points)
        add new_centroid to centroids
    return centroids

find_nearest_centroid(point, centroids):
    min_distance = infinity
    nearest_centroid = null
    for each centroid in centroids:
        distance = calculate_distance(point, centroid)
        if distance < min_distance:
            min_distance = distance
            nearest_centroid = centroid
    return nearest_centroid

calculate_distance(point1, point2):
    // Euclidean distance
    return sqrt(sum((point1[i] - point2[i])^2) for i in dimensions)

initialize_empty_clusters(k):
    clusters = []
    for i from 1 to k:
        create a new cluster and add it to clusters
    return clusters

package main

import (
    "math"
)

type Point struct {
    X, Y float64
}

type Centroid struct {
    X, Y float64
}

func kMeans(points []Point, k int) []Centroid {
    centroids := make([]Centroid, k)
    for i := range centroids {
        centroids[i] = Centroid{points[i].X, points[i].Y}
    }

    for {
        clusters := make([][]Point, k)
        for _, point := range points {
            nearestIdx := 0
            nearestDist := math.Inf(1)
            for i, centroid := range centroids {
                dist := math.Pow(point.X-centroid.X, 2) + math.Pow(point.Y-centroid.Y, 2)
                if dist < nearestDist {
                    nearestIdx = i
                    nearestDist = dist
                }
            }
            clusters[nearestIdx] = append(clusters[nearestIdx], point)
        }

        newCentroids := make([]Centroid, k)
        for i, cluster := range clusters {
            sumX, sumY := 0.0, 0.0
            for _, point := range cluster {
                sumX += point.X
                sumY += point.Y
            }
            newCentroids[i] = Centroid{sumX / float64(len(cluster)), sumY / float64(len(cluster))}
        }

        if converged(centroids, newCentroids) {
            break
        }
        centroids = newCentroids
    }

    return centroids
}

func converged(oldCentroids, newCentroids []Centroid) bool {
    for i := range oldCentroids {
        if math.Abs(oldCentroids[i].X-newCentroids[i].X) > 1e-6 || math.Abs(oldCentroids[i].Y-newCentroids[i].Y) > 1e-6 {
            return false
        }
    }
    return true
}

import java.util.ArrayList;
import java.util.List;

class Point {

  double x, y;

  Point(double x, double y) {
    this.x = x;
    this.y = y;
  }
}

class Centroid {

  double x, y;

  Centroid(double x, double y) {
    this.x = x;
    this.y = y;
  }
}

public class KMeans {

  public static List<Centroid> kMeans(List<Point> points, int k) {
    List<Centroid> centroids = new ArrayList<>();
    for (int i = 0; i < k; i++) {
      centroids.add(new Centroid(points.get(i).x, points.get(i).y));
    }

    while (true) {
      List<List<Point>> clusters = new ArrayList<>();
      for (int i = 0; i < k; i++) {
        clusters.add(new ArrayList<>());
      }

      for (Point point : points) {
        int nearestIdx = 0;
        double nearestDist = Double.POSITIVE_INFINITY;
        for (int i = 0; i < k; i++) {
          Centroid centroid = centroids.get(i);
          double dist = Math.pow(point.x - centroid.x, 2) + Math.pow(point.y - centroid.y, 2);
          if (dist < nearestDist) {
            nearestIdx = i;
            nearestDist = dist;
          }
        }
        clusters.get(nearestIdx).add(point);
      }

      List<Centroid> newCentroids = new ArrayList<>();
      for (List<Point> cluster : clusters) {
        double sumX = 0, sumY = 0;
        for (Point point : cluster) {
          sumX += point.x;
          sumY += point.y;
        }
        newCentroids.add(new Centroid(sumX / cluster.size(), sumY / cluster.size()));
      }

      if (converged(centroids, newCentroids)) {
        break;
      }
      centroids = newCentroids;
    }

    return centroids;
  }

  public static boolean converged(List<Centroid> oldCentroids, List<Centroid> newCentroids) {
    for (int i = 0; i < oldCentroids.size(); i++) {
      if (Math.abs(oldCentroids.get(i).x - newCentroids.get(i).x) > 1e-6 ||
          Math.abs(oldCentroids.get(i).y - newCentroids.get(i).y) > 1e-6) {
        return false;
      }
    }
    return true;
  }
}

class Point {
  constructor(x, y) {
    this.x = x;
    this.y = y;
  }
}

class Centroid {
  constructor(x, y) {
    this.x = x;
    this.y = y;
  }
}

function kMeans(points, k) {
  let centroids = [];
  for (let i = 0; i < k; i++) {
    centroids.push(new Centroid(points[i].x, points[i].y));
  }

  while (true) {
    let clusters = new Array(k).fill().map(() => []);

    for (let point of points) {
      let nearestIdx = 0;
      let nearestDist = Infinity;
      for (let i = 0; i < k; i++) {
        let centroid = centroids[i];
        let dist =
          Math.pow(point.x - centroid.x, 2) + Math.pow(point.y - centroid.y, 2);
        if (dist < nearestDist) {
          nearestIdx = i;
          nearestDist = dist;
        }
      }
      clusters[nearestIdx].push(point);
    }

    let newCentroids = [];
    for (let cluster of clusters) {
      let sumX = 0,
        sumY = 0;
      for (let point of cluster) {
        sumX += point.x;
        sumY += point.y;
      }
      newCentroids.push(
        new Centroid(sumX / cluster.length, sumY / cluster.length),
      );
    }

    if (converged(centroids, newCentroids)) {
      break;
    }
    centroids = newCentroids;
  }

  return centroids;
}

function converged(oldCentroids, newCentroids) {
  for (let i = 0; i < oldCentroids.length; i++) {
    if (
      Math.abs(oldCentroids[i].x - newCentroids[i].x) > 1e-6 ||
      Math.abs(oldCentroids[i].y - newCentroids[i].y) > 1e-6
    ) {
      return false;
    }
  }
  return true;
}

data class Point(val x: Double, val y: Double)

data class Centroid(var x: Double, var y: Double)

fun kMeans(points: List<Point>, k: Int): List<Centroid> {
    var centroids = points.take(k).map { Centroid(it.x, it.y) }

    while (true) {
        val clusters = List(k) { mutableListOf<Point>() }

        for (point in points) {
            var nearestIdx = 0
            var nearestDist = Double.POSITIVE_INFINITY
            for ((idx, centroid) in centroids.withIndex()) {
                val dist = Math.pow(point.x - centroid.x, 2.0) + Math.pow(point.y - centroid.y, 2.0)
                if (dist < nearestDist) {
                    nearestIdx = idx
                    nearestDist = dist
                }
            }
            clusters[nearestIdx].add(point)
        }

        val newCentroids = mutableListOf<Centroid>()
        for ((idx, cluster) in clusters.withIndex()) {
            val sumX = cluster.sumByDouble { it.x }
            val sumY = cluster.sumByDouble { it.y }
            newCentroids.add(Centroid(sumX / cluster.size, sumY / cluster.size))
        }

        if (converged(centroids, newCentroids)) {
            break
        }
        centroids = newCentroids
    }

    return centroids
}

fun converged(oldCentroids: List<Centroid>, newCentroids: List<Centroid>): Boolean {
    return oldCentroids.zip(newCentroids).all { (old, new) ->
        Math.abs(old.x - new.x) < 1e-6 && Math.abs(old.y - new.y) < 1e-6
    }
}

class Point:
    def __init__(self, x, y):
        self.x = x
        self.y = y

class Centroid:
    def __init__(self, x, y):
        self.x = x
        self.y = y

def k_means(points, k):
    centroids = [Centroid(point.x, point.y) for point in points[:k]]

    while True:
        clusters = [[] for _ in range(k)]

        for point in points:
            nearest_idx = 0
            nearest_dist = float('inf')
            for i, centroid in enumerate(centroids):
                dist = (point.x - centroid.x) ** 2 + (point.y - centroid.y) ** 2
                if dist < nearest_dist:
                    nearest_idx = i
                    nearest_dist = dist
            clusters[nearest_idx].append(point)

        new_centroids = []
        for cluster in clusters:
            sum_x = sum(point.x for point in cluster)
            sum_y = sum(point.y for point in cluster)
            new_centroids.append(Centroid(sum_x / len(cluster), sum_y / len(cluster)))

        if converged(centroids, new_centroids):
            break
        centroids = new_centroids

    return centroids

def converged(old_centroids, new_centroids):
    return all(abs(old.x - new.x) < 1e-6 and abs(old.y - new.y) < 1e-6 for old, new in zip(old_centroids, new_centroids))

#[derive(Clone, Copy)]
struct Point {
    x: f64,
    y: f64,
}

#[derive(Clone, Copy)]
struct Centroid {
    x: f64,
    y: f64,
}

fn k_means(points: &[Point], k: usize) -> Vec<Centroid> {
    let mut centroids: Vec<Centroid> = points.iter().take(k).map(|p| Centroid { x: p.x, y: p.y }).collect();

    loop {
        let mut clusters: Vec<Vec<Point>> = vec![Vec::new(); k];

        for point in points {
            let mut nearest_idx = 0;
            let mut nearest_dist = f64::INFINITY;
            for (i, centroid) in centroids.iter().enumerate() {
                let dist = (point.x - centroid.x).powi(2) + (point.y - centroid.y).powi(2);
                if dist < nearest_dist {
                    nearest_idx = i;
                    nearest_dist = dist;
                }
            }
            clusters[nearest_idx].push(*point);
        }

        let mut new_centroids: Vec<Centroid> = Vec::new();
        for cluster in &clusters {
            let sum_x: f64 = cluster.iter().map(|p| p.x).sum();
            let sum_y: f64 = cluster.iter().map(|p| p.y).sum();
            new_centroids.push(Centroid { x: sum_x / cluster.len() as f64, y: sum_y / cluster.len() as f64 });
        }

        if converged(&centroids, &new_centroids) {
            break;
        }
        centroids = new_centroids;
    }

    centroids
}

fn converged(old_centroids: &[Centroid], new_centroids: &[Centroid]) -> bool {
    old_centroids.iter().zip(new_centroids.iter()).all(|(old, new)| {
        (old.x - new.x).abs() < 1e-6 && (old.y - new.y).abs() < 1e-6
    })
}

class Point {
  constructor(
    public x: number,
    public y: number,
  ) {}
}

class Centroid {
  constructor(
    public x: number,
    public y: number,
  ) {}
}

function kMeans(points: Point[], k: number): Centroid[] {
  let centroids: Centroid[] = points
    .slice(0, k)
    .map((p) => new Centroid(p.x, p.y));

  while (true) {
    let clusters: Point[][] = new Array(k).fill(null).map(() => []);

    for (let point of points) {
      let nearestIdx = 0;
      let nearestDist = Infinity;
      for (let i = 0; i < k; i++) {
        let centroid = centroids[i];
        let dist =
          Math.pow(point.x - centroid.x, 2) + Math.pow(point.y - centroid.y, 2);
        if (dist < nearestDist) {
          nearestIdx = i;
          nearestDist = dist;
        }
      }
      clusters[nearestIdx].push(point);
    }

    let newCentroids: Centroid[] = [];
    for (let cluster of clusters) {
      let sumX = cluster.reduce((acc, p) => acc + p.x, 0);
      let sumY = cluster.reduce((acc, p) => acc + p.y, 0);
      newCentroids.push(
        new Centroid(sumX / cluster.length, sumY / cluster.length),
      );
    }

    if (converged(centroids, newCentroids)) {
      break;
    }
    centroids = newCentroids;
  }

  return centroids;
}

function converged(
  oldCentroids: Centroid[],
  newCentroids: Centroid[],
): boolean {
  return oldCentroids.every(
    (old, i) =>
      Math.abs(old.x - newCentroids[i].x) < 1e-6 &&
      Math.abs(old.y - newCentroids[i].y) < 1e-6,
  );
}

Definition​

Practice​

Definition

Practice