k-Means
Definition​
- Definition
- Explanation
- Guidance
- Tips
The k-Means algorithm is an unsupervised machine learning technique used for clustering data into k groups based on similarities in their features. It partitions the data into clusters by minimizing the sum of squared distances between data points and their respective cluster centroids
The k-Means algorithm begins by randomly initializing k cluster centroids within the data space. Next, each data point is assigned to the nearest centroid, thereby forming k clusters. After the initial assignment, the centroids are recalculated as the mean of all data points assigned to each cluster. This process of assignment and centroid recalculation is repeated iteratively until convergence is achieved, meaning that the centroids no longer change significantly or a specified number of iterations is reached
- Choose the number of clusters, k
- Initialize k centroids randomly within the data space
- Repeat until convergence
- Assign each data point to the nearest centroid
- Update the centroids by calculating the mean of all points assigned to each centroid
- Repeat until convergence or a maximum number of iterations is reached
- it's essential to select an appropriate value for k, typically determined using domain knowledge or techniques like the elbow method
- initialization of centroids can affect the final clustering, so multiple random initializations and selecting the best result can improve performance
- pay attention to the choice of distance metric, usually Euclidean distance, but other metrics may be more suitable for certain types of data
Practice​
- Practice
- Solution
kMeans(data, k):
// Step 1: Initialization
centroids = randomly_initialize_centroids(data, k)
clusters = []
repeat:
// Step 2: Assignment
clusters = assign_points_to_clusters(data, centroids)
// Step 3: Update centroids
centroids = update_centroids(clusters)
until convergence
return clusters
randomly_initialize_centroids(data, k):
centroids = []
randomly_select k points from data and add them to centroids
return centroids
assign_points_to_clusters(data, centroids):
clusters = initialize_empty_clusters(k)
for each point in data:
nearest_centroid = find_nearest_centroid(point, centroids)
add point to nearest_centroid's cluster in clusters
return clusters
update_centroids(clusters):
centroids = []
for each cluster in clusters:
new_centroid = calculate_mean(cluster.points)
add new_centroid to centroids
return centroids
find_nearest_centroid(point, centroids):
min_distance = infinity
nearest_centroid = null
for each centroid in centroids:
distance = calculate_distance(point, centroid)
if distance < min_distance:
min_distance = distance
nearest_centroid = centroid
return nearest_centroid
calculate_distance(point1, point2):
// Euclidean distance
return sqrt(sum((point1[i] - point2[i])^2) for i in dimensions)
initialize_empty_clusters(k):
clusters = []
for i from 1 to k:
create a new cluster and add it to clusters
return clusters
package main
import (
"math"
)
type Point struct {
X, Y float64
}
type Centroid struct {
X, Y float64
}
func kMeans(points []Point, k int) []Centroid {
centroids := make([]Centroid, k)
for i := range centroids {
centroids[i] = Centroid{points[i].X, points[i].Y}
}
for {
clusters := make([][]Point, k)
for _, point := range points {
nearestIdx := 0
nearestDist := math.Inf(1)
for i, centroid := range centroids {
dist := math.Pow(point.X-centroid.X, 2) + math.Pow(point.Y-centroid.Y, 2)
if dist < nearestDist {
nearestIdx = i
nearestDist = dist
}
}
clusters[nearestIdx] = append(clusters[nearestIdx], point)
}
newCentroids := make([]Centroid, k)
for i, cluster := range clusters {
sumX, sumY := 0.0, 0.0
for _, point := range cluster {
sumX += point.X
sumY += point.Y
}
newCentroids[i] = Centroid{sumX / float64(len(cluster)), sumY / float64(len(cluster))}
}
if converged(centroids, newCentroids) {
break
}
centroids = newCentroids
}
return centroids
}
func converged(oldCentroids, newCentroids []Centroid) bool {
for i := range oldCentroids {
if math.Abs(oldCentroids[i].X-newCentroids[i].X) > 1e-6 || math.Abs(oldCentroids[i].Y-newCentroids[i].Y) > 1e-6 {
return false
}
}
return true
}
import java.util.ArrayList;
import java.util.List;
class Point {
double x, y;
Point(double x, double y) {
this.x = x;
this.y = y;
}
}
class Centroid {
double x, y;
Centroid(double x, double y) {
this.x = x;
this.y = y;
}
}
public class KMeans {
public static List<Centroid> kMeans(List<Point> points, int k) {
List<Centroid> centroids = new ArrayList<>();
for (int i = 0; i < k; i++) {
centroids.add(new Centroid(points.get(i).x, points.get(i).y));
}
while (true) {
List<List<Point>> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
clusters.add(new ArrayList<>());
}
for (Point point : points) {
int nearestIdx = 0;
double nearestDist = Double.POSITIVE_INFINITY;
for (int i = 0; i < k; i++) {
Centroid centroid = centroids.get(i);
double dist = Math.pow(point.x - centroid.x, 2) + Math.pow(point.y - centroid.y, 2);
if (dist < nearestDist) {
nearestIdx = i;
nearestDist = dist;
}
}
clusters.get(nearestIdx).add(point);
}
List<Centroid> newCentroids = new ArrayList<>();
for (List<Point> cluster : clusters) {
double sumX = 0, sumY = 0;
for (Point point : cluster) {
sumX += point.x;
sumY += point.y;
}
newCentroids.add(new Centroid(sumX / cluster.size(), sumY / cluster.size()));
}
if (converged(centroids, newCentroids)) {
break;
}
centroids = newCentroids;
}
return centroids;
}
public static boolean converged(List<Centroid> oldCentroids, List<Centroid> newCentroids) {
for (int i = 0; i < oldCentroids.size(); i++) {
if (Math.abs(oldCentroids.get(i).x - newCentroids.get(i).x) > 1e-6 ||
Math.abs(oldCentroids.get(i).y - newCentroids.get(i).y) > 1e-6) {
return false;
}
}
return true;
}
}
class Point {
constructor(x, y) {
this.x = x;
this.y = y;
}
}
class Centroid {
constructor(x, y) {
this.x = x;
this.y = y;
}
}
function kMeans(points, k) {
let centroids = [];
for (let i = 0; i < k; i++) {
centroids.push(new Centroid(points[i].x, points[i].y));
}
while (true) {
let clusters = new Array(k).fill().map(() => []);
for (let point of points) {
let nearestIdx = 0;
let nearestDist = Infinity;
for (let i = 0; i < k; i++) {
let centroid = centroids[i];
let dist =
Math.pow(point.x - centroid.x, 2) + Math.pow(point.y - centroid.y, 2);
if (dist < nearestDist) {
nearestIdx = i;
nearestDist = dist;
}
}
clusters[nearestIdx].push(point);
}
let newCentroids = [];
for (let cluster of clusters) {
let sumX = 0,
sumY = 0;
for (let point of cluster) {
sumX += point.x;
sumY += point.y;
}
newCentroids.push(
new Centroid(sumX / cluster.length, sumY / cluster.length),
);
}
if (converged(centroids, newCentroids)) {
break;
}
centroids = newCentroids;
}
return centroids;
}
function converged(oldCentroids, newCentroids) {
for (let i = 0; i < oldCentroids.length; i++) {
if (
Math.abs(oldCentroids[i].x - newCentroids[i].x) > 1e-6 ||
Math.abs(oldCentroids[i].y - newCentroids[i].y) > 1e-6
) {
return false;
}
}
return true;
}
data class Point(val x: Double, val y: Double)
data class Centroid(var x: Double, var y: Double)
fun kMeans(points: List<Point>, k: Int): List<Centroid> {
var centroids = points.take(k).map { Centroid(it.x, it.y) }
while (true) {
val clusters = List(k) { mutableListOf<Point>() }
for (point in points) {
var nearestIdx = 0
var nearestDist = Double.POSITIVE_INFINITY
for ((idx, centroid) in centroids.withIndex()) {
val dist = Math.pow(point.x - centroid.x, 2.0) + Math.pow(point.y - centroid.y, 2.0)
if (dist < nearestDist) {
nearestIdx = idx
nearestDist = dist
}
}
clusters[nearestIdx].add(point)
}
val newCentroids = mutableListOf<Centroid>()
for ((idx, cluster) in clusters.withIndex()) {
val sumX = cluster.sumByDouble { it.x }
val sumY = cluster.sumByDouble { it.y }
newCentroids.add(Centroid(sumX / cluster.size, sumY / cluster.size))
}
if (converged(centroids, newCentroids)) {
break
}
centroids = newCentroids
}
return centroids
}
fun converged(oldCentroids: List<Centroid>, newCentroids: List<Centroid>): Boolean {
return oldCentroids.zip(newCentroids).all { (old, new) ->
Math.abs(old.x - new.x) < 1e-6 && Math.abs(old.y - new.y) < 1e-6
}
}
class Point:
def __init__(self, x, y):
self.x = x
self.y = y
class Centroid:
def __init__(self, x, y):
self.x = x
self.y = y
def k_means(points, k):
centroids = [Centroid(point.x, point.y) for point in points[:k]]
while True:
clusters = [[] for _ in range(k)]
for point in points:
nearest_idx = 0
nearest_dist = float('inf')
for i, centroid in enumerate(centroids):
dist = (point.x - centroid.x) ** 2 + (point.y - centroid.y) ** 2
if dist < nearest_dist:
nearest_idx = i
nearest_dist = dist
clusters[nearest_idx].append(point)
new_centroids = []
for cluster in clusters:
sum_x = sum(point.x for point in cluster)
sum_y = sum(point.y for point in cluster)
new_centroids.append(Centroid(sum_x / len(cluster), sum_y / len(cluster)))
if converged(centroids, new_centroids):
break
centroids = new_centroids
return centroids
def converged(old_centroids, new_centroids):
return all(abs(old.x - new.x) < 1e-6 and abs(old.y - new.y) < 1e-6 for old, new in zip(old_centroids, new_centroids))
#[derive(Clone, Copy)]
struct Point {
x: f64,
y: f64,
}
#[derive(Clone, Copy)]
struct Centroid {
x: f64,
y: f64,
}
fn k_means(points: &[Point], k: usize) -> Vec<Centroid> {
let mut centroids: Vec<Centroid> = points.iter().take(k).map(|p| Centroid { x: p.x, y: p.y }).collect();
loop {
let mut clusters: Vec<Vec<Point>> = vec![Vec::new(); k];
for point in points {
let mut nearest_idx = 0;
let mut nearest_dist = f64::INFINITY;
for (i, centroid) in centroids.iter().enumerate() {
let dist = (point.x - centroid.x).powi(2) + (point.y - centroid.y).powi(2);
if dist < nearest_dist {
nearest_idx = i;
nearest_dist = dist;
}
}
clusters[nearest_idx].push(*point);
}
let mut new_centroids: Vec<Centroid> = Vec::new();
for cluster in &clusters {
let sum_x: f64 = cluster.iter().map(|p| p.x).sum();
let sum_y: f64 = cluster.iter().map(|p| p.y).sum();
new_centroids.push(Centroid { x: sum_x / cluster.len() as f64, y: sum_y / cluster.len() as f64 });
}
if converged(¢roids, &new_centroids) {
break;
}
centroids = new_centroids;
}
centroids
}
fn converged(old_centroids: &[Centroid], new_centroids: &[Centroid]) -> bool {
old_centroids.iter().zip(new_centroids.iter()).all(|(old, new)| {
(old.x - new.x).abs() < 1e-6 && (old.y - new.y).abs() < 1e-6
})
}
class Point {
constructor(
public x: number,
public y: number,
) {}
}
class Centroid {
constructor(
public x: number,
public y: number,
) {}
}
function kMeans(points: Point[], k: number): Centroid[] {
let centroids: Centroid[] = points
.slice(0, k)
.map((p) => new Centroid(p.x, p.y));
while (true) {
let clusters: Point[][] = new Array(k).fill(null).map(() => []);
for (let point of points) {
let nearestIdx = 0;
let nearestDist = Infinity;
for (let i = 0; i < k; i++) {
let centroid = centroids[i];
let dist =
Math.pow(point.x - centroid.x, 2) + Math.pow(point.y - centroid.y, 2);
if (dist < nearestDist) {
nearestIdx = i;
nearestDist = dist;
}
}
clusters[nearestIdx].push(point);
}
let newCentroids: Centroid[] = [];
for (let cluster of clusters) {
let sumX = cluster.reduce((acc, p) => acc + p.x, 0);
let sumY = cluster.reduce((acc, p) => acc + p.y, 0);
newCentroids.push(
new Centroid(sumX / cluster.length, sumY / cluster.length),
);
}
if (converged(centroids, newCentroids)) {
break;
}
centroids = newCentroids;
}
return centroids;
}
function converged(
oldCentroids: Centroid[],
newCentroids: Centroid[],
): boolean {
return oldCentroids.every(
(old, i) =>
Math.abs(old.x - newCentroids[i].x) < 1e-6 &&
Math.abs(old.y - newCentroids[i].y) < 1e-6,
);
}