%% clustering
% Introduction of various clustering techniques.
%
% and
% should be in the path
%
% .
% See http://37steps.com/prtools for more.
%%
%
prwaitbar off % waitbar not needed here
delfigs % delete existing figures
randreset; % takes care of reproducability
prwarning off % no warnings
%% Define a dataset
% we use some standard routines to create 8 two-dimensional clusters.
% After creation the label information is removed.
randreset;
m = 50;
a = prdataset(rand(m,2).*repmat([8,0.5],m,1))+repmat([1 -1],m,1);
a = [a; prdataset(rand(m,2).*repmat([8,0.5],m,1)+repmat([1.5 0.5],m,1))];
x = +gendats([m m],2,4).*repmat([1,1],2*m,1)+repmat([3 5],2*m,1);
a = [a; x];
y = +[gendatc(m); gendatc(m)+repmat([0 10],m,1)];
y = y.*repmat([0.2,0.35],2*m,1)+repmat([-1.5 1.5],2*m,1);
a = [a; y];
z = prdataset(+[gencirc(m,0.05)*7; gencirc(m,0.04)*2.5]);
z = z.*repmat([0.5,0.5],2*m,1)+repmat([16 3],2*m,1);
a = [a; z];
scattern(prdataset(a,genlab(m*ones(1,8))));
axis equal;
title('Original dataset');
a = prdataset(+a); % remove labels
%%
% This dataset consist of:
%
% * two straight, noisy lines
% * two noisy circles inside each other
% * two spherical normal distributions
% * two elongated non-normal distributions.
%% partitional clustering by kmeans, find 8 clusters
figure;
[labs,b] = prkmeans(a,8,1,'rand');
subplot(2,2,1); scattern(b); axis equal
title('kmeans, iteration 1');
% The cluster labels are stored in labs. The dataset b contains the
% original objects, but labeled with the cluster labels after 1 iteration.
it = [2 2 100];
for n=1:3
% Here the results after more iterations are computed. Each call is
% initialised by the cluster labeling resulting from the previous call.
[labs,b] = prkmeans(a,8,it(n),labs);
% Note that for n=3 100 iterations are requested. The algorithm, however,
% stops much earlier, when the result is stable.
subplot(2,2,n+1); scattern(b); axis equal
title(['kmeans, iteration ' num2str(it(n)+1)]);
end
title('kmeans, final result');
%%
% The figure illustrates that the kmeans algorithm creates spherical
% clusters with the same radius.
%% hierarchical clustering, dendrograms
% The two most extreme versions are shown, single linkage and complete
% linkage. In this section the densrograms are computed.
figure;
d = sqrt(distm(a)); % The routines operate on the distance matrices
den = hclust(d,'single');
subplot(2,1,1); plotdg(den);
title('single linkage dendrogram');
set(gca,'xtick',[]); % removes the cluttered x-ticks
den = hclust(d,'complete');
subplot(2,1,2); plotdg(den);
title('complete linkage dendrogram');
set(gca,'xtick',[]); % removes the cluttered x-ticks
fontsize(14) % default font size is not nice
%%
% Dendrograms show in the horizontal direction the objects in some order.
% In the vertical direction the cluster distance is shown when two clusters
% are merged using the single linkage or complete linkage definition.
%
% The two dendrograms show the very different characteristics of the two
% procedures. By single linkage clusters grow gradually with more remotely
% located objects until they are merged with another cluster.
%
% By complete linkage clusters of similar size (largest within distance) are
% created.
%% hierarchical clustering, 4 clusters
figure;
labs = hclust(d,'single',4);
subplot(2,1,1); scattern(prdataset(a,labs));
axis equal; title('single linkage,4 clusters');
labs = hclust(d,'complete',4);
subplot(2,1,2); scattern(prdataset(a,labs));
axis equal; title('complete linkage, 4 clusters');
%%
% Here the resulting clusterings show their characteristing: elongated
% clusters constituted by touching (single linkage) or spherically shaped
% clusters neglecting touching (complete linkage). Note that single linkage
% shows one cluster of a single object. This can be verified in the
% dendrogram.
%% hierarchical clustering, 8 clusters
r = [1 3 5 7 2 4 6 8]'; % trick to shuffle labels for better visualization.
figure;
labs = hclust(d,'single',8);
subplot(2,1,1); scattern(prdataset(a,r(labs)));
axis equal; title('single linkage, 8 clusters');
labs = hclust(d,'complete',8);
subplot(2,1,2); scattern(prdataset(a,r(labs)));
axis equal; title('complete linkage, 8 clusters');
%%
% Again: elongated clusters for single linkage and spherical clusters for
% complete linkage. Single linkage has three single object clusters.
%% mode seeking
% This procedures searches the modes (local maxima) of the density. A
% nearest neighbor search is used to follow from each object the density
% gradient to a mode. The number of neighbors used for the search
% influences the number of clusters found: the more neighbors the less
% clusters.
figure;
labs = modeseek(d,20);
subplot(2,1,1); scattern(prdataset(a,labs));
axis equal; title(['mode seeking, 20 neighbors, ' num2str(max(labs)) ' clusters']);
labs = modeseek(d,10);
subplot(2,1,2); scattern(prdataset(a,labs));
axis equal; title(['mode seeking, 10 neighbors, ' num2str(max(labs)) ' clusters']);
%%
% For larger numbers of neighbors, clusters are combined as long as objects
% have neighbors in the other cluster having a higher density.