KNN.py
"""Credit: Data Science Pro Youtube. Modifications are mine"""
from collections import Counter
class KNN:
def __init__(self):
self.x = None
self.y = None
def distance(self,p1,p2):
"""Euclidean Distance"""
if len(p1) != len(p2):
raise ValueError('Datapoints are unequal in length')
sum_squares = 0
for i in range(len(p1)):
sum_squares += (p1[i] - p2[i]) ** 2
return sum_squares ** 0.5
def train(self,x,y):
self.x = x
self.y = y
def predict(self,x,k,c):
"""
x is a two-dimensional array, where the number of rows represent the number of datapoints and the number of columns
represent the number of features
y is a one-dimensional array for KNN regression, it is an array of floats, for classification it is an array of integers
calculates the distance between a new datapoint x and every point in the training data
stores the distance as a tuple with the label
Takes mean of neighbor labels in regression
Select most common neighbor label in classification
"""
preds = []
for test_point in x:
distance_label = [
(self.distance(test_point, train_point), train_label)
for train_point, train_label in zip(self.x, self.y)]
distance_label.sort()
neighbors = distance_label[:k]
if c:
neighbors_labels = [label for dist, label in neighbors]
preds.append( Counter(neighbors_labels).most_common()[0][0])
else:
preds.append(sum(label for _, label in neighbors) / k)
return preds
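

# A minimal usage sketch, my addition rather than part of the original lesson:
# the tiny dataset and the k/c choices below are made-up assumptions purely to
# illustrate the train/predict interface defined above.
if __name__ == "__main__":
    model = KNN()
    # Two features per datapoint; integer labels for classification.
    train_x = [[1.0, 2.0], [2.0, 3.0], [8.0, 9.0], [9.0, 8.0]]
    train_y = [0, 0, 1, 1]
    model.train(train_x, train_y)
    # c=True -> classification (majority vote among the k nearest neighbours).
    print(model.predict([[1.5, 2.5], [8.5, 8.5]], k=3, c=True))
    # c=False -> regression (mean of the k nearest neighbour labels).
    print(model.predict([[1.5, 2.5]], k=3, c=False))
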
"""
Code the scaler, train_test_split yourself
Try with another dataset
What are your time and space complexities?
How can you optimise them?
"""