机器学习实战--利用k-近邻算法改进约会网站

微言大义 · 发表于 2018-4-10 09:57:23

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

本帖最后由微言大义于 2018-4-10 10:43 编辑

Python小白，照着《机器学习实战》敲了k-近邻算法的代码，并添加了一些注释。完整的代码可以直接运行；背景知识以及数据文件 datingTestSet2.txt 的说明请参考原书。

import numpy as np
import operator
import matplotlib
import matplotlib.pyplot as plt
from pylab import mpl
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line is expected to hold at least three numeric feature
    columns followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float NumPy array of shape ``(number_of_records, 3)`` holding the
        first three columns and ``classLabelVector`` is a list with the
        integer label of every record.

    Raises:
        ValueError: If a feature or label field cannot be converted to a
            number.
    """
    # Read the file once and let the context manager close the handle;
    # blank lines (e.g. a trailing newline at the end of the file) are
    # skipped so they do not break the numeric conversion below.
    with open(filename) as fr:
        records = [line.strip() for line in fr if line.strip()]

    returnMat = np.zeros((len(records), 3))  # one row per record, three features
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        # The first three fields are the features.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The last field is the class label; convert explicitly so it is
        # stored as an int rather than a string.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Explore the data: draw scatter plots with Matplotlib.
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')

# First figure: column 1 against column 2, marker size and colour scaled by label.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(
    datingDataMat[:, 1],
    datingDataMat[:, 2],
    s=15.0 * np.array(datingLabels),
    c=15.0 * np.array(datingLabels),
)
# Select the SimHei font so the Chinese axis labels can be rendered.
mpl.rcParams['font.sans-serif'] = ['SimHei']
ax.set_xlabel('玩视频游戏所耗时间比')
ax.set_ylabel('每周消费的冰激凌公斤数')
plt.show()

# Second figure: column 1 against column 0, same size/colour encoding.
fig = plt.figure(2)
ax = fig.add_subplot(1, 1, 1)
ax.scatter(
    datingDataMat[:, 1],
    datingDataMat[:, 0],
    s=15.0 * np.array(datingLabels),
    c=15.0 * np.array(datingLabels),
)
ax.set_xlabel('玩视频游戏所耗时间比')
ax.set_ylabel('每年获取的飞行常客旅程数')
plt.show()
# Normalise the feature values.
def autoNorm(dataSet):
    """Rescale every feature column with ``(value - min) / (max - min)``.

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature
            per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled matrix, the
        per-column ``max - min`` values and the per-column minima.
    """
    minVals = dataSet.min(0)  # axis 0: minimum of each column, not of each row
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has a range of zero; divide it by 1 instead so the
    # column normalises to 0 rather than NaN. The returned ``ranges`` is left
    # untouched for callers.
    divisors = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no tiled
    # copies of minVals/ranges are required.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
# Run autoNorm on the loaded data so its output can be inspected; the normalised matrix, per-column ranges and per-column minima are kept as module-level names.
normMat,ranges,minVals = autoNorm(datingDataMat)
def classify0(inX,dataSet,labels,k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: Input vector to classify.
        dataSet: 2-D NumPy array of training samples, one per row.
        labels: Sequence of labels, indexed like the rows of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` nearest samples.
    """
    sampleCount = dataSet.shape[0]
    # Euclidean distance from inX to every training row.
    offsets = np.tile(inX, (sampleCount, 1)) - dataSet
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    nearestFirst = distances.argsort()  # row indices ordered by increasing distance

    # Tally the labels of the k closest rows.
    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        votes[neighbourLabel] = votes.get(neighbourLabel, 0) + 1

    # Order the (label, count) pairs by count, highest first, and return the winner.
    ranking = sorted(votes.items(), key=lambda pair: pair[1], reverse=True)
    return ranking[0][0]
# Measure the classifier's error rate.
def datingClassTest():
    """Hold out the first 10% of the dating data and print the error rate.

    The data file is loaded and normalised, the first ``hoRatio`` share of
    the rows is classified against the remaining rows with k = 3, each
    prediction is printed next to the real label, and finally the fraction
    of wrong predictions is printed.
    """
    hoRatio = 0.1
    features, trueLabels = file2matrix('datingTestSet2.txt')
    scaled, _, _ = autoNorm(features)
    rowCount = scaled.shape[0]
    numTestVecs = int(rowCount * hoRatio)

    # Rows after the held-out prefix act as the training set.
    trainingRows = scaled[numTestVecs:rowCount, :]
    trainingLabels = trueLabels[numTestVecs:rowCount]

    errorCount = 0.0
    for row in range(numTestVecs):
        predicted = classify0(scaled[row, :], trainingRows, trainingLabels, 3)
        expected = trueLabels[row]
        print("the classifier came back with: %d, the real answer is: %d"% (predicted, expected))
        if predicted != expected:
            errorCount += 1.0
    print ("the total error rate is: %f" % (errorCount/float(numTestVecs)))
# Dating-site prediction: reuses the functions above and asks the user for the feature values.
def classifyPerson():
    """Prompt for three feature values and print the predicted liking level.

    The answers are combined into a feature vector, normalised with the
    minima and ranges of the data file, classified with k = 3 against the
    normalised data, and the matching description is printed.
    """
    resultList = ['not at all','in small doses','in large doses']

    # Ask in the same order as before: games, flier miles, ice cream.
    gameShare = float(input("percentage of time spent playing video games?"))
    flierMiles = float(input("frequent flier miles earned per year?"))
    iceCreamAmount = float(input("liters of ice cream consumed per year?"))

    features, knownLabels = file2matrix('datingTestSet2.txt')
    scaled, ranges, minVals = autoNorm(features)

    # Feature order follows the columns of the data file: miles, games, ice cream.
    person = np.array([flierMiles, gameShare, iceCreamAmount])
    scaledPerson = (person - minVals) / ranges
    predictedLabel = classify0(scaledPerson, scaled, knownLabels, 3)

    # Labels start at 1, list indices at 0.
    verdict = resultList[predictedLabel - 1]
    print("you will probably like this person:", verdict)

复制代码

运行程序，得散点图如下：

在交互式命令行中输入 classifyPerson() 即可，运行结果如下：
percentage of time spent playing video games?10

frequent flier miles earned per year?10000

liters of ice cream consumed per year?0.5

you will probably like this person: in small doses

账号		自动登录	找回密码
密码			立即注册

[技术交流] 机器学习实战--利用k-近邻算法改进约会网站

马上注册，结交更多好友，享用更多功能^_^