K-Means Clustering Algorithm

ClusterAnalysisBelow is an R implementation of a k-means clustering algorithm written recently for recreational purposes.  The algorithm will accept an arbitrary bivariate data set, x, and any integer greater than 1 (the k means) as arguments.  The algorithm uses the classic optimizing mechanism:

  1. Begin by randomly choosing k centers among the n points.
  2. Group all points to the nearest of these randomly chosen centers.
  3. Find k new centers as the average of each of the partitions created in step 2.
  4. Repeat this process until stable.

This process will result in a partition of the original data set that minimizes the sum of square distances between the original n points and the final k means.  The code includes a sample data set with 4 obvious clusters.  This is only meant as an exercise in demonstrating how intuitive this algorithm actually is.  Please defer to kmeans() for all your actual k-means clustering needs.

#hypothetical bivariate data set with four apparent data clusters
set.seed(123)
x1<-runif(30,0,4); x2<-runif(30,0,4);data1<-cbind(x1,x2)
x1<-runif(30,5,10); x2<-runif(30,5,10); data2<-cbind(x1,x2)
x1<-runif(20,0,4); x2<-runif(20,5,10); data3<-cbind(x1,x2)
x1<-runif(10,5,10); x2<-runif(10,0,4); data4<-cbind(x1,x2)
x<-rbind(data1,data2,data3,data4)
#plot(x)

#########################################################
##################MYKMEANS()##########################
#The function 'mykmeans()' takes a bivariate dataset of arbitraty length
#and an integer, k, as arguments.  The function then produces a k-means
#cluster partition of the dataset and outputs the cluster assignments and 
#the cluster centers
#########################################################

mykmeans<-function(dataset,k){
dataset=x; n<-length(dataset)/2;
#select k initial centers from the dataset points
points<-matrix(0,k,2)
points_new<-matrix(0,k,2)
for(i in 1:k){
	z<-sample(1:n,1,replace=FALSE)
	points[i,]<-dataset[z,]
}
points_new<-points

#move the k initial centers into a row configuration called points_iteration
points_iteration<-matrix(0,1,2*k+1); i=1
while(i<=k){
		points_iteration[1,2*i-1]<-points[i,1]
		points_iteration[1,2*i]<-points[i,2]
		i=i+1
}
points_iteration[1,2*k+1]=10

#initialize all elements needed for the main while loop
points_test<-c(rep(1,2*k))
points_row<-c(rep(0,2*k))
points_row<-points_iteration
test_matrix<-cbind(x,matrix(0,n,k+1))
m=2

#####################WHILE LOOP##################

while(sum(abs(points-points_test))>0.000000001){

#Test all point in the dataset against the k points in the initial_points dataset.
for(i in 1:n){
	for(j in 1:k){
	test_matrix[i,j+2]<-sqrt((test_matrix[i,1]-points_row[1,2*j-1])^2
	           +(test_matrix[i,2]-points_row[1,2*j])^2)
	}
}

#Identify the shortest distance and categorize based on this value.
for(i in 1:n){
	for(j in 1:k){
		if(min(test_matrix[i,3:(k+2)])==test_matrix[i,j+2]){test_matrix[i,(k+3)]=j}
	}
}

#average over each the first two columns of 'test_matrix' by cluster assignment.
#and assign these points to the 'points' matrix.
for (i in 1:k){
	   points_new[i,1]<-mean(subset(test_matrix,test_matrix[,k+3]==i)[,1])
	   points_new[i,2]<-mean(subset(test_matrix,test_matrix[,k+3]==i)[,2])	
}

#assign these averages to the data set bottom row of 'points_iteration'
for(i in 1:k){
		points_row[1,2*i-1]<-points_new[i,1]
		points_row[1,2*i]<-points_new[i,2]
}

#calculate sum(abs(points-points_new)) and put this in the last
#column of points_iteration
points_row[1,2*k+1]<-sum(abs(points-points_new))

#Add the new points row to the bottom of the points iteration matrix
points_iteration<-rbind(points_iteration,points_row)

#Move the new points into the points test vector
points_test<-points
points<-points_new
}
#####################WHILE LOOP##################

#Identify the k centers from the iterations matrix
output<-matrix(0,k,2)
output_points<-subset(points_iteration[,1:(2*k)],
                  points_iteration[,2*k+1]<0.000001)
for(i in 1:k){
	output[i,1]<-output_points[1,2*i-1]
	output[i,2]<-output_points[1,2*i]
}

#Identify the clustering vector
cluster_vector<-test_matrix[,k+3]

#Output the items
cat("The ",k," cluster centers are: ","\n")
write.table(output,col.names=FALSE)
cat("The cluster assignments are: ",cluster_vector,"\n")

#Output a plot of the data and cluster assignments
plot(x,xlim=c(0,11),ylim=c(0,11),main="original data with cluster 
centers and cluster id")
par(new=T)
plot(output,col="red",xlim=c(0,11),ylim=c(0,11),pch="X",xlab="",ylab="")

}

mykmeans(x,4)

#The  4  cluster centers are:  
#"1" 7.56305967554605 7.59627604969622
#"2" 2.28960065136974 1.754563482292
#"3" 2.21102243796922 6.8037627490703
#"4" 7.20495334023144 2.42752894861624
#The cluster assignments are:  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
#2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3
#3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 

kmeans(x,4)
#Cluster means:
#       x1       x2
#1 7.563060 7.596276
#2 2.289601 1.754563
#3 2.211022 6.803763
#4 7.204953 2.427529
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s