k-NN
Definition
- Definition
- Explanation
- Guidance
- Tips
The k-Nearest Neighbors (k-NN) algorithm is a simple yet powerful supervised machine learning algorithm used for classification and regression tasks. It works by finding the k closest data points in the training set to a given input data point and then making predictions based on the labels or values of those neighboring points.
Select the number of neighbors (k) and specify a distance metric. Then, given an input data point, identify the k nearest neighbors using the chosen distance metric. For classification purposes, ascertain the majority class within the k neighbors and designate it as the predicted class for the input point. In regression tasks, compute the average (or weighted average) of the target values of the k neighbors, assigning it as the predicted value. Finally, return the predicted class or value based on the task at hand.
- Choose the value of k, which determines how many neighbors will be considered for prediction
- Select a distance metric such as Euclidean distance or Manhattan distance
- For each data point in the dataset
- calculate the distance between the input data point and all other data points using the chosen distance metric
- sort the distances in ascending order and select the k nearest neighbors
- For classification
- determine the majority class among the k neighbors
- assign this class as the predicted class for the input data point
- For regression
- calculate the average (or weighted average) of the target values of the k neighbors
- assign this value as the predicted value for the input data point
- choose an appropriate value of k based on the dataset size and complexity. A smaller k may lead to overfitting, while a larger k may lead to underfitting
- experiment with different distance metrics to find the one that best fits the data distribution
- scaling the features can improve the performance of the algorithm, especially when using distance-based metrics
- consider using weighted voting for classification, where closer neighbors have a higher influence on the prediction
Practice
- Practice
- Solution
// Predicts a class label (task_type == "classification") or a numeric
// value (task_type == "regression") for input_point from its k nearest
// neighbors in training_data. Any other task_type returns nothing.
knn_predict(input_point, training_data, k, task_type):
// Step 1: Calculate distances from input_point to every training point
distances = []
for point in training_data:
distance = calculate_distance(input_point, point)
distances.append((point, distance))
// Step 2: Sort distances (ascending, nearest first)
sorted_distances = sort(distances, by=distance_value)
// Step 3: Select k nearest neighbors
nearest_neighbors = sorted_distances[:k]
// Step 4: Perform prediction based on task type
if task_type == "classification":
predicted_class = majority_vote(nearest_neighbors)
return predicted_class
elif task_type == "regression":
predicted_value = calculate_average(nearest_neighbors)
return predicted_value
// Pluggable distance metric between two points.
calculate_distance(point1, point2):
// Implementation of distance calculation (e.g., Euclidean, Manhattan)
// Returns the distance value
// Sort key: extracts the distance component of a (point, distance) pair.
distance_value(element):
// Returns the distance value of the element (used for sorting)
// Returns the most frequent class among the neighbors.
// Ties are broken by whichever class reaches the highest count first
// during iteration (i.e., first-seen order).
majority_vote(neighbors):
// Step 5a: Count occurrences of each class
class_counts = {}
for neighbor in neighbors:
// Each neighbor is a (point, distance) pair; the class lives on the point.
neighbor_class = neighbor[0].class
if neighbor_class in class_counts:
class_counts[neighbor_class] += 1
else:
class_counts[neighbor_class] = 1
// Step 5b: Find the class with the highest count
max_count = 0
majority_class = None
for cls, count in class_counts.items():
if count > max_count:
max_count = count
majority_class = cls
return majority_class
// Returns the mean of the neighbors' target values (regression).
// Assumes neighbors is non-empty; an empty list would divide by zero.
calculate_average(neighbors):
// Step 5: Calculate the average (or weighted average) of target values
total_value = 0
for neighbor in neighbors:
total_value += neighbor[0].value // Assuming the value is stored in neighbor[0]
average_value = total_value / len(neighbors)
return average_value
package main
import (
"math"
"sort"
)
// Point is a 2-D coordinate used as a k-NN sample.
type Point struct {
X float64
Y float64
}
// ByDistance implements sort.Interface, ordering Points by their
// Euclidean distance to Target (nearest first).
type ByDistance struct {
	Points []Point
	Target Point
}

// Len reports the number of candidate points.
func (bd ByDistance) Len() int {
	return len(bd.Points)
}

// Swap exchanges the points at indices i and j in place.
func (bd ByDistance) Swap(i, j int) {
	bd.Points[i], bd.Points[j] = bd.Points[j], bd.Points[i]
}

// Less reports whether point i is closer to Target than point j.
// math.Hypot replaces the original Sqrt(Pow(..,2)+Pow(..,2)) form:
// it is shorter and avoids overflow/underflow in the squared terms.
func (bd ByDistance) Less(i, j int) bool {
	di := math.Hypot(bd.Points[i].X-bd.Target.X, bd.Points[i].Y-bd.Target.Y)
	dj := math.Hypot(bd.Points[j].X-bd.Target.X, bd.Points[j].Y-bd.Target.Y)
	return di < dj
}
// KNN returns the k points in data closest to target by Euclidean
// distance. k is clamped to [0, len(data)], so out-of-range values no
// longer panic as the original slice expression did. Note: data is
// sorted in place, matching the original behavior.
func KNN(data []Point, target Point, k int) []Point {
	byDist := ByDistance{Points: data, Target: target}
	sort.Sort(byDist)
	if k > len(data) {
		k = len(data)
	}
	if k < 0 {
		k = 0
	}
	return byDist.Points[:k]
}
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
/** A 2-D point used as a k-NN sample. */
class Point {
// Coordinates are package-private and mutable, per the original design.
double x, y;
Point(double x, double y) {
this.x = x;
this.y = y;
}
}
public class KNN {
    /**
     * Returns the {@code k} points in {@code data} closest to {@code target}
     * (Euclidean distance).
     *
     * Fixes vs. the original: {@code k} is clamped to [0, data.size()] so an
     * out-of-range value no longer throws IndexOutOfBoundsException, and the
     * result is copied into a new ArrayList instead of returning a subList
     * view that aliases (and is invalidated by changes to) the caller's list.
     * The input list is still sorted in place, as before.
     */
    static List<Point> KNN(List<Point> data, Point target, int k) {
        data.sort(new ByDistance(target));
        int limit = Math.min(Math.max(k, 0), data.size());
        return new ArrayList<>(data.subList(0, limit));
    }

    /** Orders points by their Euclidean distance to a fixed target. */
    static class ByDistance implements Comparator<Point> {
        Point target;

        ByDistance(Point target) {
            this.target = target;
        }

        /** Euclidean distance from p to the target; hypot avoids overflow. */
        double distance(Point p) {
            return Math.hypot(p.x - target.x, p.y - target.y);
        }

        public int compare(Point p1, Point p2) {
            return Double.compare(distance(p1), distance(p2));
        }
    }
}
/** A 2-D point with numeric x and y coordinates. */
class Point {
  constructor(x, y) {
    // Assign both coordinates as own enumerable properties.
    Object.assign(this, { x, y });
  }
}
/**
 * Euclidean distance between two points with numeric x/y properties.
 * Math.hypot replaces the manual sqrt(dx^2 + dy^2): it is clearer and
 * robust against overflow/underflow in the squared intermediates.
 */
function distance(p1, p2) {
  return Math.hypot(p1.x - p2.x, p1.y - p2.y);
}
/**
 * Returns the k points in data closest to target (Euclidean distance).
 * Fix vs. the original: sorts a shallow copy, so the caller's array is
 * no longer reordered as a side effect. slice() already tolerates
 * k > data.length, returning the whole sorted copy.
 */
function KNN(data, target, k) {
  return [...data]
    .sort((a, b) => distance(a, target) - distance(b, target))
    .slice(0, k);
}
/** An immutable 2-D point used as a k-NN sample. */
data class Point(val x: Double, val y: Double)
class KNN {
    companion object {
        /**
         * Returns the [k] points of [data] nearest to [target] by Euclidean
         * distance, closest first. [take] tolerates k larger than the list
         * size, and [sortedBy] leaves the caller's list untouched.
         */
        fun knn(data: List<Point>, target: Point, k: Int): List<Point> {
            return data.sortedBy { distance(it, target) }.take(k)
        }

        /**
         * Euclidean distance between [p1] and [p2]. Plain multiplication
         * replaces `Math.pow(x, 2.0)`, which is slower and less idiomatic
         * for squaring.
         */
        private fun distance(p1: Point, p2: Point): Double {
            val dx = p1.x - p2.x
            val dy = p1.y - p2.y
            return Math.sqrt(dx * dx + dy * dy)
        }
    }
}
import math
class Point:
    """A 2-D point with ``x`` and ``y`` coordinates."""

    def __init__(self, x, y):
        self.x = x
        self.y = y
def distance(p1, p2):
    """Euclidean distance between two objects exposing ``.x`` and ``.y``.

    ``math.hypot`` replaces the manual sqrt-of-squares form: it is clearer
    and robust against overflow/underflow in the squared intermediates.
    """
    return math.hypot(p1.x - p2.x, p1.y - p2.y)
def KNN(data, target, k):
    """Return the k points in ``data`` nearest to ``target``, closest first."""
    ranked = sorted(data, key=lambda point: distance(point, target))
    return ranked[:k]
/// A 2-D point used as a k-NN sample.
///
/// `Clone` is derived because `knn` copies the input slice via
/// `data.to_vec()`, which requires `Point: Clone`; without it the
/// original file did not compile.
#[derive(Debug, Clone)]
struct Point {
    x: f64,
    y: f64,
}
/// Euclidean distance between two points.
///
/// `f64::hypot` replaces the manual `sqrt(dx² + dy²)`: it is shorter
/// and avoids overflow/underflow in the squared intermediates.
fn distance(p1: &Point, p2: &Point) -> f64 {
    (p1.x - p2.x).hypot(p1.y - p2.y)
}
/// Returns the `k` points in `data` closest to `target` (Euclidean),
/// nearest first.
///
/// Fixes vs. the original:
/// * `truncate(k)` tolerates `k > data.len()` instead of panicking on
///   the out-of-range slice `sorted_data[..k]`.
/// * `f64::total_cmp` provides a total order, so NaN distances no
///   longer panic through `partial_cmp(..).unwrap()`.
fn knn(data: &[Point], target: &Point, k: usize) -> Vec<Point> {
    let mut ranked = data.to_vec();
    ranked.sort_by(|a, b| distance(a, target).total_cmp(&distance(b, target)));
    ranked.truncate(k);
    ranked
}
/** A 2-D point with numeric x and y coordinates. */
class Point {
constructor(
public x: number,
public y: number,
) {}
}
/**
 * Euclidean distance between two points.
 * Math.hypot replaces the manual sqrt(dx^2 + dy^2): it is clearer and
 * robust against overflow/underflow in the squared intermediates.
 */
function distance(p1: Point, p2: Point): number {
  return Math.hypot(p1.x - p2.x, p1.y - p2.y);
}
/**
 * Returns the k points in data closest to target (Euclidean distance).
 * Fix vs. the original: sorts a shallow copy, so the caller's array is
 * no longer reordered as a side effect. slice() already tolerates
 * k > data.length, returning the whole sorted copy.
 */
function KNN(data: Point[], target: Point, k: number): Point[] {
  return [...data]
    .sort((a, b) => distance(a, target) - distance(b, target))
    .slice(0, k);
}