import numpy as np import matplotlib.pyplot as plt
# load data
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of the text file to read. Each non-blank line holds
            one sample whose fields are separated by tab characters.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        that line's fields converted to ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed, even when a
    # conversion error aborts the loop.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise reach float('') and raise.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
# init K points randomly
def randCent(dataSet, k):
    """Create ``k`` random centroids inside the bounding box of ``dataSet``.

    Args:
        dataSet: 2-D array-like (ndarray, matrix or nested list) with one
            sample per row and one feature per column.
        k: Number of centroids to generate.

    Returns:
        A ``k x n`` ``numpy.matrix`` (``n`` = number of columns of
        ``dataSet``). Column ``j`` of every centroid is drawn uniformly from
        ``[min_j, max_j)`` of the data's column ``j``.
    """
    # Coerce once so nested lists are accepted and column slicing works.
    data = np.asarray(dataSet, dtype=float)
    n = data.shape[1]
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        # Scale uniform [0, 1) samples into the value range of column j.
        column = data[:, j]
        minJ = column.min()
        rangeJ = float(column.max() - minJ)
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
# K-means algorithm
def _euclidean(vecA, vecB):
    """Return the Euclidean (L2) distance between two vectors."""
    diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return float(np.sqrt(np.sum(diff ** 2)))


def kMeans(dataSet, k, distMeas=None, createCent=None):
    """Cluster ``dataSet`` into ``k`` groups with the k-means algorithm.

    Args:
        dataSet: 2-D array (ndarray or matrix; other array-likes are
            converted) with one sample per row.
        k: Number of clusters.
        distMeas: Callable ``distMeas(centroid_row, sample_row)`` returning a
            distance. Defaults to the Euclidean distance.
        createCent: Callable ``createCent(dataSet, k)`` returning the initial
            ``k`` centroids, one per row. Defaults to ``randCent``.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` holds one
        centroid per row. ``clusterAssment`` is an ``m x 2`` matrix whose
        first column is the index of the cluster each sample was assigned to
        and whose second column is the squared distance to that centroid.
    """
    # Defaults are resolved at call time: naming module-level functions in
    # the signature fails with NameError when they are not yet defined at
    # the point this ``def`` executes.
    if distMeas is None:
        distMeas = _euclidean
    if createCent is None:
        createCent = randCent
    if not isinstance(dataSet, np.ndarray):
        dataSet = np.asarray(dataSet, dtype=float)

    m = np.shape(dataSet)[0]
    # Result matrix: column 0 = cluster index, column 1 = squared error.
    # np.asmatrix instead of np.mat: the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible index so the very first assignment of every
    # point (including those landing in cluster 0) counts as a change.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)

    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # for every point
            minDist = float('inf')
            minIndex = -1
            for j in range(k):  # find the nearest of the k centers
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            # A single point switching cluster is enough to require
            # another iteration.
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2

        # Move every center to the mean of the points assigned to it.
        for cent in range(k):
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            if members.size == 0:
                # Averaging an empty selection yields NaN, which would make
                # this cluster permanently unreachable; keep the old center.
                continue
            centroids[cent, :] = np.mean(dataSet[members], axis=0)
    return centroids, clusterAssment
import numpy as np import matplotlib.pyplot as plt from sklearn.cluster import KMeans
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of the text file to read. Each non-blank line holds
            one sample whose fields are separated by tab characters.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        that line's fields converted to ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed, even when a
    # conversion error aborts the loop.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line would otherwise reach float('') and raise.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
# Read the samples from disk and hold them as a NumPy array.
data = np.array(loadDataSet('testSet.txt'))

# Number of clusters.
k = 4

# A figure with a single axes: 1x1 grid, first (only) cell.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)