

function [M, gnd, Ni, stopPartitioning] = dbkmeans(X, k, Nmin)

%
%[M, gnd, Ni, stopPartitioning] = dbkmeans(X, k, Nmin)
%
% This is an extension of the k-means algorithm that uses a deterministic
% initialization and returns a partition where the clusters are balanced.
% The clusters are balanced in the sense that no cluster contains fewer
% than Nmin observations. For speed, the fast pairwise distance
% computation in [1] and parts of the fast MATLAB k-means implementation
% (litekmeans) in [2] have been used.
%
% The external function sqdistance [1] must be on the path; it is called
% only when a cluster has to be refilled up to Nmin observations.
%
%
% IN
%
% X: data set - F x N (one observation per column)
%
% k: requested number of clusters
%
% Nmin: minimum allowed number of observations per cluster
%
%
% OUT
%
% M: cluster means - K x F, where K is the number of non-empty clusters
%    (K may be smaller than k because empty clusters are removed). These
%    are the means that were used to produce the returned assignment.
%
% gnd: cluster labels assigned to the observations in X - N x 1
%
% Ni: number of observations in the i-th cluster - K x 1
%
% stopPartitioning:
%  false: a balanced partition was created
%  true: a balanced partition cannot be created (floor(N/k) < Nmin);
%        in this case M, gnd and Ni are returned empty
%
%
% Related references:
%
% 1. Michael Chen, "sqdistance",
% http://www.mathworks.com/matlabcentral/fileexchange/24599-pairwise-distan
% ce-matrix/content/sqdistance.m,
%
% 2. Michael Chen, "litekmeans",
% http://www.mathworks.com/matlabcentral/fileexchange/24616-kmeans-clustering
% , 2012
%
% Author: Nikolaos Gkalelis - CERTH-ITI
% Email: gkalelis@iti.gr
%
% Created 01 Aug 2013
%

%% initialize
maxIter = 50; % maximum number of iterations for producing a balanced partition (change if necessary !)
N = size(X,2);
lbl_prev = 0; % scalar sentinel: differs from every label, so the loop runs at least once

%% check if a balanced partition can be provided
stopPartitioning = false;
if floor(N/k) < Nmin
    fprintf('dbkmeans>> Cannot produce a balance partition of (%d) observations per cluster for (%d) total observations\n', Nmin, N);
    M = []; gnd = []; Ni = [];
    stopPartitioning = true;
    return;
end

%% deterministic initialization
% consecutive runs of ceil(N/k) observations get labels 1, 2, ..., k;
% trailing clusters may stay empty and are dropped by unique() below
lbl = zeros(1,N);
step = ceil(N/k);
a = 1;
for i=1:k
    b = min(a + step-1, N);
    lbl(a:b) = i;
    a = b+1;
end

%% partitioning algorithm
iter = 0;
unmatchedOld = 2*N; % initialize with a value larger than any possible mismatch count
while any(lbl ~= lbl_prev) && iter < maxIter
    iter = iter + 1;
    [u, tmp, lbl] = unique(lbl);   % remove empty clusters (relabel to 1..k)
    % unique() returns its index outputs as column vectors in recent MATLAB
    % releases, whereas max(...,[],1) below returns a row vector. Force a
    % row here so that lbl and lbl_prev always compare element-wise.
    lbl = lbl(:)';
    k = length(u);
    E = sparse(1:N,lbl,1,N,k,N);  % transform lbl into indicator matrix
    M = X*(E*spdiags(1./sum(E,1)',0,k,k));    % compute mean of each cluster (F x k)
    lbl_prev = lbl;
    % nearest center: maximizing m'*x - m'*m/2 is the same as minimizing ||x - m||^2
    [tmp, lbl] = max(bsxfun(@minus,M'*X,dot(M,M,1)'/2),[],1);

    % number of samples per cluster (k x 1, zeros for clusters that got no sample)
    Ni = accumarray(lbl(:), 1, [k 1]);

    % check if cardinality of clusters is valid and add observations to
    % invalid clusters by taking observations from valid ones
    Nv = Ni < Nmin;
    if any(Nv) % if any cluster is invalid
        for i=1:k
            if Nv(i)
                % visit the observations from nearest to farthest from center i
                pd = sqdistance(M(:,i), X);
                [pdval, pdidx] = sort(pd);
                j = 0;
                while Ni(i) < Nmin
                    j = j + 1;
                    if j > length(lbl)
                        % all observations were visited without finding enough
                        % donors; leave the loop instead of indexing past the
                        % end of pdidx
                        fprintf('dbkmeans>> exited length\n');
                        break;
                    end
                    donor = lbl(pdidx(j));
                    % only take from clusters that stay at or above Nmin afterwards
                    if donor ~= i && Ni(donor) > Nmin
                        Ni(donor) = Ni(donor) - 1;
                        lbl(pdidx(j)) = i; % flip the lbl of this sample
                        Ni(i) = Ni(i) + 1;
                    end
                end
            end
        end
    end

    % keep partition if it provides the best match until now
    % (fewest label changes with respect to the previous partition)
    unmatchedNew = sum(lbl ~= lbl_prev);
    if unmatchedNew <= unmatchedOld
        unmatchedOld = unmatchedNew;
        labelopt = lbl;
        Mo = M;
        Niopt = Ni;
        %fprintf('dbkmeans>> Keeping Current optimum partition: Iter(%d), Unmatched:(%d)\n', iter, unmatchedNew);
    end

end

[tmpA, tmpB, gnd] = unique(labelopt); % remove empty clusters

%% return optimum partition
Ni = Niopt;
M = Mo';
gnd = gnd(:); % always N x 1, regardless of the orientation unique() returns

