K近邻算法原理
K近邻算法(K-Nearest Neighbor, KNN)是最简单的监督学习分类算法之一。
对于一个待分类样本点,K近邻算法在训练集中寻找距它最近的K个训练样本点,即K个近邻(Nearest Neighbor)。
若这K个近邻中属于某一类别的样本最多,则判定该待分类样本点也属于这一类别。
KNN算法的Python实现
KNN算法没有显式的训练过程,因此很容易实现。
import operator

import numpy as np


class KNNClassifier:
    """K-Nearest Neighbor (KNN) classifier.

    KNN has no explicit training phase: a query point is assigned the
    majority label among its k nearest training samples (squared
    Euclidean distance over min-max normalized features).
    """

    def __init__(self):
        # Training samples (rows of feature values) and their int labels.
        self.dataSet = []
        self.labels = []

    def loadDataSet(self, filename):
        """Load whitespace-separated samples from *filename*.

        Each line holds the feature values followed by the class label
        in the last column.
        """
        # 'with' guarantees the handle is closed (original leaked it).
        with open(filename) as fr:
            for line in fr:
                fields = [float(tok) for tok in line.strip().split()]
                if not fields:
                    continue  # skip blank lines instead of crashing on pop()
                label = fields.pop()  # pop the last column referring to label
                self.dataSet.append(fields)
                self.labels.append(int(label))

    def setDataSet(self, dataSet, labels):
        """Set the training samples and labels directly."""
        self.dataSet = dataSet
        self.labels = labels

    def classify(self, data, k):
        """Return the majority label among the k nearest training samples.

        Parameters:
            data: sequence of feature values for the query point.
            k: number of neighbors to vote.
        """
        self.dataSet = np.array(self.dataSet, dtype=float)
        self.labels = np.array(self.labels)
        minVals, ranges = self._normDataSet()
        # BUG FIX: the query must be normalized with the same min/range
        # as the training data; the original compared a raw query
        # against normalized samples, giving wrong neighbors.
        query = (np.array(data, dtype=float) - minVals) / ranges
        # Squared Euclidean distance to every sample; sqrt is omitted
        # because it does not change the ordering.
        diff = self.dataSet - query  # broadcasting replaces tile()
        distances = (diff ** 2).sum(axis=1)
        # Vote among the k nearest neighbors.
        classCount = {}
        for idx in distances.argsort()[:k]:
            voteIlabel = self.labels[idx]
            classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
        # BUG FIX: dict.iteritems() does not exist in Python 3
        # (the file already uses print() function syntax).
        sortedClassCount = sorted(classCount.items(),
                                  key=operator.itemgetter(1), reverse=True)
        return sortedClassCount[0][0]

    def _normDataSet(self):
        """Min-max normalize self.dataSet in place.

        Returns (minVals, ranges) so callers can apply the identical
        transform to query points.
        """
        minVals = self.dataSet.min(0)
        maxVals = self.dataSet.max(0)
        ranges = maxVals - minVals
        # Guard against division by zero for constant feature columns.
        ranges = np.where(ranges == 0, 1.0, ranges)
        self.dataSet = (self.dataSet - minVals) / ranges  # element wise divide
        return minVals, ranges

    def test(self):
        """Smoke test on a tiny hand-made data set."""
        self.dataSet = np.array([[1.0, 1.1], [1.0, 1.0], [0.9, 0.9],
                                 [0, 0], [0, 0.1], [0, 0.2]])
        self.labels = [1, 1, 1, 2, 2, 2]
        print(self.classify([1.0, 1.1], 2))


if __name__ == '__main__':
    KNN = KNNClassifier()
    KNN.loadDataSet('testData.txt')
    print(KNN.classify([72011, 4.932976, 0.632026], 5))
    # KNN.test()