|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 微言大义 于 2018-4-10 10:43 编辑
我是 Python 小白,照着《机器学习实战》敲了 k-近邻算法的代码,并添加了一些注释。完整代码可以直接运行;算法背景和运行环境的说明请参阅原书。
- import numpy as np
- import operator
- import matplotlib
- import matplotlib.pyplot as plt
- from pylab import mpl
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line is split on tab characters. The first three fields
    are stored as floats in the feature matrix and the last field is stored
    as an integer class label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is a float
        NumPy array with one row per parsed line and three columns, and
        ``classLabelVector`` is a list holding the integer label of each row.

    Raises:
        ValueError: If a feature field is not a valid float or the last
            field is not a valid integer.
    """
    # Read the file once and let the context manager close the handle.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # The second dimension is fixed at 3: only three features are kept.
    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # Convert explicitly instead of relying on NumPy's implicit
        # string-to-float casting on assignment.
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        # Labels must be stored as ints; otherwise they would stay strings.
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
-
# Analyze the data: draw scatter plots with Matplotlib.
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
# The scaled class label is used for both the marker size and the color.
markerValues = 15.0 * np.array(datingLabels)

# Figure 1: column 1 against column 2 of the feature matrix.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2], s=markerValues, c=markerValues)
# Use the SimHei font so the Chinese axis labels can be displayed.
mpl.rcParams['font.sans-serif'] = ['SimHei']
ax.set_xlabel('玩视频游戏所耗时间比')
ax.set_ylabel('每周消费的冰激凌公斤数')
plt.show()

# Figure 2: column 1 against column 0 of the feature matrix.
fig = plt.figure(2)
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 0], s=markerValues, c=markerValues)
ax.set_xlabel('玩视频游戏所耗时间比')
ax.set_ylabel('每年获取的飞行常客旅程数')
plt.show()
# Normalize the feature values
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; its normalized
    values are set to 0 instead of dividing by zero.

    Args:
        dataSet: 2-D array-like of numeric values, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalized array, the
        per-column ``max - min`` values and the per-column minimums.
    """
    dataSet = np.asarray(dataSet)
    # Axis 0 takes the minimum/maximum down each column, not across a row.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN/inf.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# Run autoNorm on the loaded data so that its result can be inspected.
(normMat, ranges, minVals) = autoNorm(datingDataMat)
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: Input vector to classify.
        dataSet: Training samples as a NumPy array, one sample per row.
        labels: Sequence of labels, indexed like the rows of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` samples closest to
        ``inX`` by Euclidean distance.
    """
    sampleCount = dataSet.shape[0]
    # Euclidean distance from inX to every row of the training set.
    deltas = np.tile(inX, (sampleCount, 1)) - dataSet
    distances = ((deltas ** 2).sum(axis=1)) ** 0.5
    # Row indices ordered from the smallest distance to the largest.
    nearestFirst = distances.argsort()

    # Tally one vote per label for each of the k closest rows.
    tally = {}
    for rank in range(k):
        vote = labels[nearestFirst[rank]]
        tally[vote] = tally.get(vote, 0) + 1

    # Order the labels by descending vote count and return the winner.
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ranked[0]
-
# Measure the classifier's error rate
def datingClassTest(hoRatio=0.1, filename='datingTestSet2.txt', k=3):
    """Measure the kNN classifier's error rate on a hold-out slice of the data.

    The data file is loaded and normalized. The first ``hoRatio`` fraction of
    the rows is used as test vectors and the remaining rows as the training
    set. Each prediction and the final error rate are printed.

    Args:
        hoRatio: Fraction of the rows held out for testing.
        filename: Path of the tab-separated data file to load.
        k: Number of nearest neighbours used by the classifier.

    Returns:
        The fraction of test vectors that were misclassified.

    Raises:
        ValueError: If the data set is too small for ``hoRatio`` to yield at
            least one test vector.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs <= 0:
        # Without this guard the error-rate division below would fail with
        # an uninformative ZeroDivisionError.
        raise ValueError(
            "hold-out ratio %r yields no test vectors for %d samples"
            % (hoRatio, m)
        )

    # Rows after the hold-out slice form the training set for every query.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat, trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"% (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1

    errorRate = errorCount / float(numTestVecs)
    print ("the total error rate is: %f" % errorRate)
    return errorRate
# Dating-site prediction function: reuses the functions above and prompts the user for input
def classifyPerson():
    """Prompt for three feature values and print the predicted verdict.

    Reads three floats from standard input, normalizes them with the minimums
    and ranges computed from 'datingTestSet2.txt', classifies the resulting
    vector with the 3 nearest neighbours, and prints the verdict text selected
    by the predicted label.
    """
    gamingShare = float(input("percentage of time spent playing video games?"))
    flierMiles = float(input("frequent flier miles earned per year?"))
    iceCreamLiters = float(input("liters of ice cream consumed per year?"))

    features, knownLabels = file2matrix('datingTestSet2.txt')
    normalized, spans, minima = autoNorm(features)

    # Scale the query with the same minimums and ranges as the training data.
    query = (np.array([flierMiles, gamingShare, iceCreamLiters]) - minima) / spans
    predicted = classify0(query, normalized, knownLabels, 3)

    # The predicted label is used as a 1-based index into the verdict texts.
    verdicts = ['not at all', 'in small doses', 'in large doses']
    print("you will probably like this person:", verdicts[predicted - 1])
复制代码
运行程序,得散点图如下:
在 Python 交互式控制台中输入 classifyPerson() 并回车即可,运行结果如下:
percentage of time spent playing video games?10
frequent flier miles earned per year?10000
liters of ice cream consumed per year?0.5
you will probably like this person: in small doses
|
-
散点图
-
散点图
|