Below is an R implementation of a k-means clustering algorithm written recently for recreational purposes. The algorithm will accept an arbitrary bivariate data set, x, and any integer greater than 1 (the k means) as arguments. The algorithm uses the classic optimizing mechanism:
- Begin by randomly choosing k centers among the n points.
- Group all points to the nearest of these randomly chosen centers.
- Find k new centers as the average of each of the partitions created in step 2.
- Repeat this process until stable.
This process will result in a partition of the original data set that minimizes the sum of square distances between the original n points and the final k means. The code includes a sample data set with 4 obvious clusters. This is only meant as an exercise in demonstrating how intuitive this algorithm actually is. Please defer to kmeans() for all your actual k-means clustering needs.
#hypothetical bivariate data set with four apparent data clusters set.seed(123) x1<-runif(30,0,4); x2<-runif(30,0,4);data1<-cbind(x1,x2) x1<-runif(30,5,10); x2<-runif(30,5,10); data2<-cbind(x1,x2) x1<-runif(20,0,4); x2<-runif(20,5,10); data3<-cbind(x1,x2) x1<-runif(10,5,10); x2<-runif(10,0,4); data4<-cbind(x1,x2) x<-rbind(data1,data2,data3,data4) #plot(x) ######################################################### ##################MYKMEANS()########################## #The function 'mykmeans()' takes a bivariate dataset of arbitraty length #and an integer, k, as arguments. The function then produces a k-means #cluster partition of the dataset and outputs the cluster assignments and #the cluster centers ######################################################### mykmeans<-function(dataset,k){ dataset=x; n<-length(dataset)/2; #select k initial centers from the dataset points points<-matrix(0,k,2) points_new<-matrix(0,k,2) for(i in 1:k){ z<-sample(1:n,1,replace=FALSE) points[i,]<-dataset[z,] } points_new<-points #move the k initial centers into a row configuration called points_iteration points_iteration<-matrix(0,1,2*k+1); i=1 while(i<=k){ points_iteration[1,2*i-1]<-points[i,1] points_iteration[1,2*i]<-points[i,2] i=i+1 } points_iteration[1,2*k+1]=10 #initialize all elements needed for the main while loop points_test<-c(rep(1,2*k)) points_row<-c(rep(0,2*k)) points_row<-points_iteration test_matrix<-cbind(x,matrix(0,n,k+1)) m=2 #####################WHILE LOOP################## while(sum(abs(points-points_test))>0.000000001){ #Test all point in the dataset against the k points in the initial_points dataset. for(i in 1:n){ for(j in 1:k){ test_matrix[i,j+2]<-sqrt((test_matrix[i,1]-points_row[1,2*j-1])^2 +(test_matrix[i,2]-points_row[1,2*j])^2) } } #Identify the shortest distance and categorize based on this value. for(i in 1:n){ for(j in 1:k){ if(min(test_matrix[i,3:(k+2)])==test_matrix[i,j+2]){test_matrix[i,(k+3)]=j} } } #average over each the first two columns of 'test_matrix' by cluster assignment. #and assign these points to the 'points' matrix. for (i in 1:k){ points_new[i,1]<-mean(subset(test_matrix,test_matrix[,k+3]==i)[,1]) points_new[i,2]<-mean(subset(test_matrix,test_matrix[,k+3]==i)[,2]) } #assign these averages to the data set bottom row of 'points_iteration' for(i in 1:k){ points_row[1,2*i-1]<-points_new[i,1] points_row[1,2*i]<-points_new[i,2] } #calculate sum(abs(points-points_new)) and put this in the last #column of points_iteration points_row[1,2*k+1]<-sum(abs(points-points_new)) #Add the new points row to the bottom of the points iteration matrix points_iteration<-rbind(points_iteration,points_row) #Move the new points into the points test vector points_test<-points points<-points_new } #####################WHILE LOOP################## #Identify the k centers from the iterations matrix output<-matrix(0,k,2) output_points<-subset(points_iteration[,1:(2*k)], points_iteration[,2*k+1]<0.000001) for(i in 1:k){ output[i,1]<-output_points[1,2*i-1] output[i,2]<-output_points[1,2*i] } #Identify the clustering vector cluster_vector<-test_matrix[,k+3] #Output the items cat("The ",k," cluster centers are: ","\n") write.table(output,col.names=FALSE) cat("The cluster assignments are: ",cluster_vector,"\n") #Output a plot of the data and cluster assignments plot(x,xlim=c(0,11),ylim=c(0,11),main="original data with cluster centers and cluster id") par(new=T) plot(output,col="red",xlim=c(0,11),ylim=c(0,11),pch="X",xlab="",ylab="") } mykmeans(x,4) #The 4 cluster centers are: #"1" 7.56305967554605 7.59627604969622 #"2" 2.28960065136974 1.754563482292 #"3" 2.21102243796922 6.8037627490703 #"4" 7.20495334023144 2.42752894861624 #The cluster assignments are: 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 #2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 #3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 kmeans(x,4) #Cluster means: # x1 x2 #1 7.563060 7.596276 #2 2.289601 1.754563 #3 2.211022 6.803763 #4 7.204953 2.427529