kNN--近邻算法

佚名 8年前 (2018-12-08) 人工智能 724人围观抢沙发百度已收录

kNN--近邻算法

kNN算法的核心思想是如果一个样本在特征空间中的k个最相邻的样本中的大多数属于某一个类别，则该样本也属于这个类别，并具有这个类别上样本的特性。

在机器学习中常用于分类。

SRE实战互联网时代守护先锋，助力企业售后服务体系运筹帷幄！一键直达领取阿里云限量特价优惠。

数学内容：

欧氏距离公式，矩阵运算，归一化数值

python模块：

numpy，operator（用其中的itemgetter做排序），os（用其中的listdir函数列出目录中的文件），matplotlib.pyplot（数据可视化与数据分析），

PIL（对图片进行处理）

from numpy import *
import operator
from os import listdir

def createDataSet():
    """Return a tiny hand-made training set for trying out the classifier.

    Returns:
        A ``(points, classes)`` pair: a 4x2 array of 2-D sample points and
        the list of their class names (two 'A' samples, then two 'B').
    """
    class_a_points = [[1.0, 1.1], [1.0, 1.0]]
    class_b_points = [[0, 0], [0, 0.1]]
    points = array(class_a_points + class_b_points)
    classes = ['A'] * 2 + ['B'] * 2
    return points, classes

#k-近邻算法
def classify0(inX, dataset, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: feature vector to classify.
        dataset: 2-D array with one training sample per row.
        labels: class label of each row of ``dataset``, in the same order.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` samples closest to
        ``inX`` by Euclidean distance.
    """
    sample_count = dataset.shape[0]

    # Euclidean distance from inX to every training sample.
    offsets = tile(inX, (sample_count, 1)) - dataset
    distances = ((offsets ** 2).sum(axis=1)) ** 0.5

    # Row indices of the training samples, nearest first.
    nearest_first = distances.argsort()

    # Count how many of the k nearest samples carry each label.
    votes = {}
    for rank in range(k):
        label = labels[nearest_first[rank]]
        votes[label] = votes.get(label, 0) + 1

    # Order the labels by vote count, most votes first, and return the winner.
    ranked_labels = sorted(votes, key=votes.get, reverse=True)
    return ranked_labels[0]

#准备数据
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line supplies one sample: its first three columns are the
    numeric features and its last column is the integer class label. Blank
    lines (for example a trailing empty line) are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(matrix, labels)`` pair: an N x 3 float array of features and a
        list of N integer labels, where N is the number of non-blank lines.

    Raises:
        ValueError: if a feature or label column cannot be parsed as a number.
    """
    rows = []
    # The context manager closes the file even when parsing fails later on.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if line:
                rows.append(line.split('\t'))

    returnMatrix = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # Write exactly one row; do not overwrite every row from index onwards.
        returnMatrix[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMatrix, classLabelVector

#通过公式 "newValue=(oldValue-min)/(max-min)" 将任意取值范围的特征值转化为0到1区间内的值
def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the 0..1 interval.

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.
    A column whose values are all equal has a range of 0; it is mapped to 0
    instead of being divided by zero, which would fill it with NaN.

    Args:
        dataset: 2-D numeric array with one sample per row.

    Returns:
        A ``(normDataset, ranges, minVals)`` triple: the rescaled array, the
        per-column ``max - min`` (0 for constant columns) and the per-column
        minimum. The last two allow the same scaling to be applied to new data.
    """
    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they come out as 0 rather than NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row.
    normDataset = (dataset - minVals) / safeRanges
    return normDataset, ranges, minVals

#测试算法
def datingClassTest():
    """Measure the classifier's error rate on the dating data set.

    Loads 'datingTestSet2.txt', normalizes it, uses the first 10% of the rows
    as test queries and the remaining rows as training samples, and classifies
    each query with k = 3. Every prediction is printed (wrong ones wrapped in
    the ANSI red escape sequence), followed by the overall error rate.
    """
    # Share of the rows held out as test queries.
    hoRatio = 0.10
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # Rows after the held-out part form the training set.
    trainingRows = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        predicted = classify0(normMat[i], trainingRows, trainingLabels, 3)
        actual = datingLabels[i]
        if predicted == actual:
            print("the classifier came back with: %d, the real answer is: %d" % (predicted, actual))
        else:
            # Wrong prediction: print it in red and count it.
            print("\033[0;31mthe classifier came back with: %d, the real answer is: %d\033[0m" % (predicted, actual))
            errorCount += 1.0
    print("the total error rate is:%f" % (errorCount / float(numTestVecs)))

#约会系统
def classifiyPerson():
    """Ask for three feature values and print the predicted class.

    Prompts for the three features, normalizes them with the minimum and
    range taken from 'datingTestSet2.txt', classifies the result with k = 3
    against the normalized data set and prints the matching description.
    """
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent filer miles earned per year?"))
    iceCream = float(input("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # Scale the query exactly like the training samples before classifying.
    query = (array([ffMiles, percentTats, iceCream]) - minVals) / ranges
    predictedClass = classify0(query, normMat, datingLabels, 3)

    # The code indexes with (class - 1), i.e. it expects class values 1, 2, 3.
    descriptions = ("not at all", "in small doses", "in large doses")
    print("You will probably like this person: %s" % (descriptions[predictedClass - 1]))

#将32x32的二进制图像文件转换成1x1024的向量
def img2vector(filename):
    """Read a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row after row, so the character at line ``i``, column
    ``j`` lands at index ``32*i + j``.

    Args:
        filename: path of the text file holding the image.

    Returns:
        A 1x1024 float array with the pixel values.

    Raises:
        IndexError: if a line has fewer than 32 characters or the file has
            fewer than 32 lines.
        ValueError: if a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file; callers invoke this once per image.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

#手写数字识别系统
def handwritingClassTest():
    """Measure the classifier's error rate on the handwritten-digit files.

    Builds the training matrix from every file in 'digits/trainingDigits',
    then classifies every file in 'digits/testDigits' with k = 3. The digit a
    file represents is the part of its name before the first '_' (for example
    '0_7.txt' is a 0). Every prediction is printed (wrong ones wrapped in the
    ANSI red escape sequence), followed by the error count and error rate.
    """
    # Training set: one 1024-value row and one label per file.
    trainingFileList = listdir('digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    hwLabels = []
    for row, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(int(fileNameStr.split('.')[0].split('_')[0]))
        trainingMat[row, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)

    # Test set: classify each file and compare with the digit in its name.
    testFileList = listdir('digits/testDigits')
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        if classifierResult == classNumStr:
            print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        else:
            print("\033[0;31mthe classifier came back with: %d, the real answer is: %d\033[0m" % (classifierResult, classNumStr))
            errorCount += 1
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))

#在网上找数字图片做测试
def imgNumClassTest(filename):
    """Classify one digit image file and print the predicted digit.

    The training set is rebuilt from every file in 'digits/trainingDigits'
    (the digit is the part of the file name before the first '_'), and
    ``filename`` is then classified against it with k = 3.

    Args:
        filename: path of a 32x32 text image in the format ``img2vector`` reads.
    """
    trainingFileList = listdir('digits/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    hwLabels = []
    for row, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(int(fileNameStr.split('.')[0].split('_')[0]))
        trainingMat[row, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)

    classifierResult = classify0(img2vector(filename), trainingMat, hwLabels, 3)
    print(classifierResult)

约会网站案例数据分析代码：

"""
分析数据
"""

import kNN
from numpy import *
import matplotlib
import matplotlib.pyplot as plt

# Load the dating samples: the feature matrix and the class label of each row.
datingDataMat, datingLabels = kNN.file2matrix('datingTestSet2.txt')

# One figure window holding two stacked plots (2 rows x 1 column).
fig = plt.figure()

# Top plot (211 = 2 rows, 1 column, first plot): column 1 on the x axis
# against column 2 on the y axis. Marker size and colour are both 15 times
# the class label, so each class gets its own size and colour.
ax1 = fig.add_subplot(211)
markerScale = 15 * array(datingLabels)
ax1.scatter(datingDataMat[:, 1], datingDataMat[:, 2], markerScale, markerScale)
ax1.set_xlabel('Play game takes time')
ax1.set_ylabel('Eat ice-cream')

# Bottom plot (212 = 2 rows, 1 column, second plot): column 0 against
# column 1, drawn one class at a time so each class gets a legend entry.
ax2 = fig.add_subplot(212)
datingLabels = array(datingLabels)
for classValue, pointColor, legendText, pointSize in (
        (1, 'm', 'Hate', 50),
        (2, 'c', 'General', 30),
        (3, 'r', 'Like', 10)):
    # Indices of the rows whose label equals this class value.
    classRows = where(datingLabels == classValue)
    ax2.scatter(datingDataMat[classRows, 0], datingDataMat[classRows, 1],
                color=pointColor, label=legendText, s=pointSize)
ax2.set_xlabel('Flying')
ax2.set_ylabel('Play game takes time')
# Put the legend in the upper-left corner of the bottom plot.
ax2.legend(loc='upper left')

# Display the figure.
plt.show()

手写数字识别系统图片转文本文件代码：

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt


def img2txt(img_path, txt_name):
    """
    Convert an image into a 32x32 grid of digits stored in a text file.

    :param img_path: path of the image file to read
    :param txt_name: path of the text file to write
    """
    # Convert to mode '1' (bilevel) first, then resize to 32x32.
    source = Image.open(img_path)
    bilevel = source.convert('1')
    scaled = bilevel.resize((32, 32))  # type: Image.Image

    # Write the pixel array one image row per line, each value formatted as
    # an integer with no separator between values.
    pixels = np.asarray(scaled)
    np.savetxt(txt_name, pixels, fmt='%d', delimiter='')