2017-02-18 10:09:00 · 评论

k均值聚类又称 kmeans，是一种非监督学习算法。先简单说一下监督学习与非监督学习的区别：监督学习使用带标签的样本训练模型（如分类、回归），而非监督学习面对的是无标签数据，由算法自行发现数据中的结构——kmeans 就是通过把样本划分成 k 个簇来做到这一点的。

step1：读取数据，把制表符分隔的文本文件解析为数值列表。

# Read a tab-delimited numeric text file into a list of float rows.
def loadDataSet(filename):
    """Parse `filename` (one sample per line, fields separated by tabs)
    and return a list of lists of floats, one inner list per line."""
    datamat = []
    with open(filename) as f:  # `with` guarantees the file is closed
        for line in f:
            t = line.strip().split('\t')
            # list(...) keeps Python 2/3 behavior identical (map is lazy in Py3)
            temp = list(map(float, t))
            datamat.append(temp)
    return datamat
step2：

step3：定义欧氏距离函数和随机质心初始化函数。

```#距离函数
def getDistance(vecA, vecB):
    """Return the Euclidean distance between two numeric vectors
    (1 x n numpy matrices or arrays)."""
    return sqrt(sum(power(vecA - vecB, 2)))

# Pick k random initial centroids for k-means.
def randCent(dataset, k):
    """Return a (k, n) matrix of centroids, each coordinate drawn
    uniformly within the corresponding column's [min, max] range
    of `dataset` (an m x n matrix)."""
    n = shape(dataset)[1]
    cent = mat(zeros((k, n)))
    for j in range(n):
        minnum = dataset[:, j].min()
        rangenum = dataset[:, j].max() - minnum
        # scale k uniform samples in [0, 1) into the column's range
        cent[:, j] = minnum + rangenum * random.rand(k, 1)
    return cent
```

step4：实现标准 kmeans 聚类算法。

kmeans 聚类算法反复迭代地调整这些质心：先把每个样本分配给离它最近的质心，再把每个质心移动到其所属样本的均值位置，直到所有样本的簇分配不再改变为止。

# Standard k-means clustering algorithm.
def kMeans(data, k, distype=None, centtype=None):
    """Cluster `data` (m x n, list-of-lists or matrix) into k clusters.

    distype: distance function (defaults to getDistance).
    centtype: initial-centroid chooser (defaults to randCent).
    Returns (cent, disarr): cent is the (k, n) centroid matrix and
    disarr[i] = [cluster index of point i, squared distance to its centroid].
    """
    # resolve defaults at call time so the function also works standalone
    if distype is None:
        distype = getDistance
    if centtype is None:
        centtype = randCent
    dataset = mat(data)
    m = shape(dataset)[0]
    cent = centtype(dataset, k)
    # column 0: assigned cluster index; column 1: squared distance
    disarr = mat(zeros((m, 2)))
    centchange = True
    while centchange:
        centchange = False
        # assignment step: each point goes to its nearest centroid
        for i in range(m):
            mindis = inf
            minindex = 0
            for j in range(k):
                dis = distype(dataset[i, :], cent[j])
                if dis < mindis:
                    mindis = dis
                    minindex = j
            if minindex != disarr[i, 0]:
                centchange = True
            # store the SQUARED distance so SSE sums match biKMeans' seeding
            disarr[i] = minindex, mindis ** 2
        # update step: move each centroid to the mean of ALL its members
        # (the original took only the first member via a stray [0] index)
        for j in range(k):
            members = dataset[nonzero(disarr[:, 0].A == j)[0]]
            if members.shape[0] > 0:  # leave an empty cluster's centroid alone
                cent[j, :] = mean(members, axis=0)
    return cent, disarr

step5：实现二分 kmeans（bisecting kmeans）算法：从单个簇出发，每次选择二分后总 SSE 最小的簇进行分裂，直到得到 k 个簇。

```#二分kmeans算法
def biKMeans(dataset, k, distype=getDistance):
    """Bisecting k-means: start from one cluster, then repeatedly 2-way
    split the cluster whose split yields the lowest total SSE, until
    k clusters exist.

    Returns (cent, disarr): cent is a list of k centroid coordinate lists,
    disarr[i] = [cluster index of point i, squared distance to its centroid].
    """
    data = mat(dataset)
    m = shape(data)[0]
    disarr = mat(zeros((m, 2)))
    # start with a single cluster centered on the global mean
    firstcent = mean(data, axis=0).tolist()[0]
    cent = [firstcent]
    for j in range(m):
        disarr[j, 1] = distype(data[j], mat(cent[0])) ** 2
    while len(cent) < k:
        bestdis = inf
        for i in range(len(cent)):
            # points currently assigned to cluster i
            temparr = data[nonzero(disarr[:, 0].A == i)[0], :]
            if shape(temparr)[0] < 2:
                continue  # a cluster with fewer than 2 points cannot be bisected
            tempcent, tempdisarr = kMeans(temparr, 2, distype)
            splitdis = sum(tempdisarr[:, 1])  # SSE of the 2-way split
            nosplitdis = sum(disarr[nonzero(disarr[:, 0].A != i)[0], 1])
            if (splitdis + nosplitdis) < bestdis:
                bestsplit = i
                bestcent = tempcent
                bestarr = tempdisarr.copy()
                bestdis = splitdis + nosplitdis
        # relabel the two halves: sub-cluster 1 gets a brand-new index,
        # sub-cluster 0 keeps the index of the cluster that was split
        bestarr[nonzero(bestarr[:, 0].A == 1)[0], 0] = len(cent)
        bestarr[nonzero(bestarr[:, 0].A == 0)[0], 0] = bestsplit
        cent[bestsplit] = bestcent[0].A.tolist()[0]
        cent.append(bestcent[1].A.tolist()[0])
        disarr[nonzero(disarr[:, 0].A == bestsplit)[0], :] = bestarr
        print(disarr)  # debug: assignments after each split (parens: Py2+Py3)
    return cent, disarr
```

step6：应用实例——用球面余弦定理计算地理距离，对经纬度数据做二分 kmeans 聚类并绘图。

```#地球距离函数
def distSLC(vecA, vecB):
    """Great-circle distance in kilometres between two (1, 2) point
    matrices laid out as [longitude, latitude] in degrees, using the
    Spherical Law of Cosines."""
    a = sin(vecA[0, 1] * pi / 180) * sin(vecB[0, 1] * pi / 180)
    b = cos(vecA[0, 1] * pi / 180) * cos(vecB[0, 1] * pi / 180) * \
        cos(pi * (vecB[0, 0] - vecA[0, 0]) / 180)
    return arccos(a + b) * 6371.0  # 6371 km = mean Earth radius; pi from numpy

# Example: cluster geographic coordinates with bisecting k-means and plot them.
# NOTE(review): this snippet is incomplete as pasted — `line` is used without
# the surrounding file-reading loop, and `plt`, `ax0`, `ax1`, `imgP` are
# referenced but never created (the matplotlib import, figure/axes setup and
# background-image load are missing). Restore those pieces before running.
def clusterClubs(numClust=5):
datList = []
# NOTE(review): a `for line in open(...)` loop presumably wrapped the next
# two lines — confirm against the original source.
lineArr = line.split('\t')
# fields 4 and 3 look like [longitude, latitude] — verify against the data file
datList.append([float(lineArr[4]), float(lineArr[3])])
datMat = mat(datList)
# cluster with bisecting k-means using the great-circle distance
myCentroids, clustAssing = biKMeans(datMat, numClust, distype=distSLC)
fig = plt.figure()
rect=[0.1,0.1,0.8,0.8]
scatterMarkers=['s', 'o', '^', '8', 'p', \
'd', 'v', 'h', '>', '<']
colortype = ['blue', 'green', 'yellow', 'black', 'm']
axprops = dict(xticks=[], yticks=[])
# NOTE(review): `ax0` (background-image axes) and `imgP` (the map image) are
# undefined here — their setup is missing from the snippet.
ax0.imshow(imgP)
for i in range(numClust):
# plot each cluster's points with a distinct marker and color
ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A==i)[0],:]
markerStyle = scatterMarkers[i % len(scatterMarkers)]
ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0], ptsInCurrCluster[:,1].flatten().A[0], marker=markerStyle, s=90,c = colortype[i%len(colortype)])
myCentroids = array(myCentroids)
# mark the final centroids with large red crosses
ax1.scatter(myCentroids[:,0].flatten(), myCentroids[:,1].flatten(), marker='+', s=300, c = 'red')
plt.show()```