Orfeo Toolbox  3.16
KdTreeBasedKMeansClustering.cxx
Go to the documentation of this file.
1 /*=========================================================================
2 
3  Program: ORFEO Toolbox
4  Language: C++
5  Date: $Date$
6  Version: $Revision$
7 
8 
9  Copyright (c) Centre National d'Etudes Spatiales. All rights reserved.
10  See OTBCopyright.txt for details.
11 
12 
13  This software is distributed WITHOUT ANY WARRANTY; without even
14  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15  PURPOSE. See the above copyright notices for more information.
16 
17 =========================================================================*/
18 
19 
20 // Software Guide : BeginLatex
21 // \index{Statistics!k-means clustering (using k-d tree)}
22 //
23 // \index{itk::Statistics::KdTree\-Based\-Kmeans\-Estimator}
24 //
25 // K-means clustering is a popular clustering algorithm because it is simple
26 // and usually converges to a reasonable solution. The k-means algorithm
27 // works as follows:
28 //
29 // \begin{enumerate}
30 // \item{Obtains the initial k means input from the user.}
31 // \item{Assigns each measurement vector in a sample container to its
32 // closest mean among the k number of means (i.e., update the membership of
33 // each measurement vectors to the nearest of the k clusters).}
34 // \item{Calculates each cluster's mean from the newly assigned
35 // measurement vectors (updates the centroid (mean) of k clusters).}
36 // \item{Repeats step 2 and step 3 until it meets the termination
37 // criteria.}
38 // \end{enumerate}
39 //
40 // The most common termination criteria is that if there is no
41 // measurement vector that changes its cluster membership from the
42 // previous iteration, then the algorithm stops.
43 //
44 // The \subdoxygen{itk}{Statistics}{KdTreeBasedKmeansEstimator} is a variation of
45 // this logic. The k-means clustering algorithm is computationally very
46 // expensive because it has to recalculate the mean at each iteration. To
47 // update the mean values, we have to calculate the distance between k means
48 // and each and every measurement vector. To reduce the computational burden,
49 // the KdTreeBasedKmeansEstimator uses a special data structure: the
50 // k-d tree (\subdoxygen{itk}{Statistics}{KdTree}) with additional
51 // information. The additional information includes the number and the vector
52 // sum of measurement vectors under each node under the tree architecture.
53 //
54 // With such additional information and the k-d tree data structure,
55 // we can reduce the computational cost of the distance calculation
56 // and means. Instead of calculating each measurement vectors and k
57 // means, we can simply compare each node of the k-d tree and the k
58 // means. This idea of utilizing a k-d tree can be found in multiple
59 // articles \cite{Alsabti1998} \cite{Pelleg1999}
60 // \cite{Kanungo2000}. Our implementation of this scheme follows the
61 // article by the Kanungo et al \cite{Kanungo2000}.
62 //
63 // We use the \subdoxygen{itk}{Statistics}{ListSample} as the input sample, the
64 // \doxygen{itk}{Vector} as the measurement vector. The following code
65 // snippet includes their header files.
66 //
67 // Software Guide : EndLatex
68 
69 // Software Guide : BeginCodeSnippet
70 #include "itkVector.h"
71 #include "itkListSample.h"
72 // Software Guide : EndCodeSnippet
73 
74 // Software Guide : BeginLatex
75 //
76 // Since this k-means algorithm requires a \subdoxygen{itk}{Statistics}{KdTree}
77 // object as an input, we include the KdTree class header file. As mentioned
78 // above, we need a k-d tree with the vector sum and the number of
79 // measurement vectors. Therefore we use the
80 // \subdoxygen{itk}{Statistics}{WeightedCentroidKdTreeGenerator} instead of the
81 // \subdoxygen{itk}{Statistics}{KdTreeGenerator} that generate a k-d tree without
82 // such additional information.
83 //
84 // Software Guide : EndLatex
85 
86 // Software Guide : BeginCodeSnippet
87 #include "itkKdTree.h"
89 // Software Guide : EndCodeSnippet
90 
91 // Software Guide : BeginLatex
92 //
93 // The KdTreeBasedKmeansEstimator class is the implementation of the
94 // k-means algorithm. It does not create k clusters. Instead, it
95 // returns the mean estimates for the k clusters.
96 //
97 // Software Guide : EndLatex
98 
99 // Software Guide : BeginCodeSnippet
101 // Software Guide : EndCodeSnippet
102 
103 // Software Guide : BeginLatex
104 //
105 // To generate the clusters, we must create k instances of
106 // \subdoxygen{itk}{Statistics}{EuclideanDistance} function as the membership
107 // functions for each cluster and plug that---along with a sample---into an
108 // \subdoxygen{itk}{Statistics}{SampleClassifier} object to get a
109 // \subdoxygen{itk}{Statistics}{MembershipSample} that stores pairs of measurement
110 // vectors and their associated class labels (k labels).
111 //
112 // Software Guide : EndLatex
113 
114 // Software Guide : BeginCodeSnippet
115 #include "itkMinimumDecisionRule.h"
116 #include "itkEuclideanDistance.h"
117 #include "itkSampleClassifier.h"
118 // Software Guide : EndCodeSnippet
119 
120 // Software Guide : BeginLatex
121 //
122 // We will fill the sample with random variables from two normal
123 // distribution using the \subdoxygen{itk}{Statistics}{NormalVariateGenerator}.
124 //
125 // Software Guide : EndLatex
126 
127 // Software Guide : BeginCodeSnippet
129 // Software Guide : EndCodeSnippet
130 
131 int main()
132 {
133  // Software Guide : BeginLatex
134  //
135  // Since the \code{NormalVariateGenerator} class only supports 1-D, we
136  // define our measurement vector type as one component vector. We
137  // then, create a \code{ListSample} object for data inputs. Each
138  // measurement vector is of length 1. We set this using the
139  // \code{SetMeasurementVectorSize()} method.
140  // Software Guide : EndLatex
141 
142  // Software Guide : BeginCodeSnippet
143  typedef itk::Vector<double,
144  1>
145  MeasurementVectorType;
147  SampleType::Pointer sample = SampleType::New();
148  sample->SetMeasurementVectorSize(1);
149  // Software Guide : EndCodeSnippet
150 
151  // Software Guide : BeginLatex
152  //
153  // The following code snippet creates a NormalVariateGenerator
154  // object. Since the random variable generator returns values
155  // according to the standard normal distribution (The mean is zero,
156  // and the standard deviation is one), before pushing random values
157  // into the \code{sample}, we change the mean and standard
158  // deviation. We want two normal (Gaussian) distribution data. We have
159  // two for loops. Each \code{for} loop uses different mean and standard
160  // deviation. Before we fill the \code{sample} with the second
161  // distribution data, we call \code{Initialize(random seed)} method,
162  // to recreate the pool of random variables in the
163  // \code{normalGenerator}.
164  //
165  // To see the probability density plots from the two distribution,
166  // refer to the Figure~\ref{fig:TwoNormalDensityFunctionPlot}.
167  //
168  // \begin{figure}
169  // \center
170  // \includegraphics[width=0.8\textwidth]{TwoNormalDensityFunctionPlot.eps}
171  // \itkcaption[Two normal distributions plot]{Two normal distributions' probability density plot
172  // (The means are 100 and 200, and the standard deviation is 30 )}
173  // \protect\label{fig:TwoNormalDensityFunctionPlot}
174  // \end{figure}
175  //
176  // Software Guide : EndLatex
177 
178  // Software Guide : BeginCodeSnippet
179  typedef itk::Statistics::NormalVariateGenerator NormalGeneratorType;
180  NormalGeneratorType::Pointer normalGenerator = NormalGeneratorType::New();
181 
182  normalGenerator->Initialize(101);
183 
184  MeasurementVectorType mv;
185  double mean = 100;
186  double standardDeviation = 30;
187  for (unsigned int i = 0; i < 100; ++i)
188  {
189  mv[0] = (normalGenerator->GetVariate() * standardDeviation) + mean;
190  sample->PushBack(mv);
191  }
192 
193  normalGenerator->Initialize(3024);
194  mean = 200;
195  standardDeviation = 30;
196  for (unsigned int i = 0; i < 100; ++i)
197  {
198  mv[0] = (normalGenerator->GetVariate() * standardDeviation) + mean;
199  sample->PushBack(mv);
200  }
201  // Software Guide : EndCodeSnippet
202 
203  // Software Guide : BeginLatex
204  //
205  // We create a k-d tree. %To see the details on the k-d tree generation, see
206  // %the Section~\ref{sec:KdTree}.
207  //
208  // Software Guide : EndLatex
209 
210  // Software Guide : BeginCodeSnippet
212  TreeGeneratorType;
213  TreeGeneratorType::Pointer treeGenerator = TreeGeneratorType::New();
214 
215  treeGenerator->SetSample(sample);
216  treeGenerator->SetBucketSize(16);
217  treeGenerator->Update();
218  // Software Guide : EndCodeSnippet
219 
220  // Software Guide : BeginLatex
221  //
222  // Once we have the k-d tree, it is a simple procedure to produce k
223  // mean estimates.
224  //
225  // We create the KdTreeBasedKmeansEstimator. Then, we provide the initial
226  // mean values using the \code{SetParameters()}. Since we are dealing with
227  // two normal distribution in a 1-D space, the size of the mean value array
228  // is two. The first element is the first mean value, and the second is the
229  // second mean value. If we used two normal distributions in a 2-D space,
230  // the size of array would be four, and the first two elements would be the
231  // two components of the first normal distribution's mean vector. We
232  // plug-in the k-d tree using the \code{SetKdTree()}.
233  //
234  // The remaining two methods specify the termination condition. The
235  // estimation process stops when the number of iterations reaches the
236  // maximum iteration value set by the \code{SetMaximumIteration()}, or the
237  // distances between the newly calculated mean (centroid) values and
238  // previous ones are within the threshold set by the
239  // \code{SetCentroidPositionChangesThreshold()}. The final step is
240  // to call the \code{StartOptimization()} method.
241  //
242  // The for loop will print out the mean estimates from the estimation
243  // process.
244  //
245  // Software Guide : EndLatex
246 
247  // Software Guide : BeginCodeSnippet
248  typedef TreeGeneratorType::KdTreeType TreeType;
250  EstimatorType::Pointer estimator = EstimatorType::New();
251 
252  EstimatorType::ParametersType initialMeans(2);
253  initialMeans[0] = 0.0;
254  initialMeans[1] = 0.0;
255 
256  estimator->SetParameters(initialMeans);
257  estimator->SetKdTree(treeGenerator->GetOutput());
258  estimator->SetMaximumIteration(200);
259  estimator->SetCentroidPositionChangesThreshold(0.0);
260  estimator->StartOptimization();
261 
262  EstimatorType::ParametersType estimatedMeans = estimator->GetParameters();
263 
264  for (unsigned int i = 0; i < 2; ++i)
265  {
266  std::cout << "cluster[" << i << "] " << std::endl;
267  std::cout << " estimated mean : " << estimatedMeans[i] << std::endl;
268  }
269  // Software Guide : EndCodeSnippet
270 
271  // Software Guide : BeginLatex
272  //
273  // If we are only interested in finding the mean estimates, we might
274  // stop. However, to illustrate how a classifier can be formed using
275  // the statistical classification framework. We go a little bit
276  // further in this example.
277  //
278  // Since the k-means algorithm is an minimum distance classifier using
279  // the estimated k means and the measurement vectors. We use the
280  // EuclideanDistance class as membership functions. Our choice
281  // for the decision rule is the
282  // \subdoxygen{itk}{Statistics}{MinimumDecisionRule} that returns the
283  // index of the membership functions that have the smallest value for
284  // a measurement vector.
285  //
286  // After creating a SampleClassifier object and a MinimumDecisionRule
287  // object, we plug-in the \code{decisionRule} and the \code{sample} to the
288  // classifier. Then, we must specify the number of classes that will be
289  // considered using the \code{SetNumberOfClasses()} method.
290  //
291  // The remainder of the following code snippet shows how to use
292  // user-specified class labels. The classification result will be stored in
293  // a MembershipSample object, and for each measurement vector, its
294  // class label will be one of the two class labels, 100 and 200
295  // (\code{unsigned int}).
296  //
297  // Software Guide : EndLatex
298 
299  // Software Guide : BeginCodeSnippet
301  <MeasurementVectorType> MembershipFunctionType;
302  typedef itk::MinimumDecisionRule DecisionRuleType;
303  DecisionRuleType::Pointer decisionRule = DecisionRuleType::New();
304 
305  typedef itk::Statistics::SampleClassifier<SampleType> ClassifierType;
306  ClassifierType::Pointer classifier = ClassifierType::New();
307 
308  classifier->SetDecisionRule((itk::DecisionRuleBase::Pointer) decisionRule);
309  classifier->SetSample(sample);
310  classifier->SetNumberOfClasses(2);
311 
312  std::vector<unsigned int> classLabels;
313  classLabels.resize(2);
314  classLabels[0] = 100;
315  classLabels[1] = 200;
316 
317  classifier->SetMembershipFunctionClassLabels(classLabels);
318  // Software Guide : EndCodeSnippet
319 
320  // Software Guide : BeginLatex
321  //
322  // The \code{classifier} is almost ready to do the classification
323  // process except that it needs two membership functions that
324  // represents two clusters respectively.
325  //
326  // In this example, the two clusters are modeled by two Euclidean distance
327  // functions. The distance function (model) has only one parameter, its mean
328  // (centroid) set by the \code{SetOrigin()} method. To plug-in two distance
329  // functions, we call the \code{AddMembershipFunction()} method. Then
330  // invocation of the \code{Update()} method will perform the
331  // classification.
332  //
333  // Software Guide : EndLatex
334 
335  // Software Guide : BeginCodeSnippet
336  std::vector<MembershipFunctionType::Pointer> membershipFunctions;
337 
338  MembershipFunctionType::OriginType origin(
339  sample->GetMeasurementVectorSize());
340  int index = 0;
341  for (unsigned int i = 0; i < 2; ++i)
342  {
343  membershipFunctions.push_back(MembershipFunctionType::New());
344  for (unsigned int j = 0; j < sample->GetMeasurementVectorSize(); ++j)
345  {
346  origin[j] = estimatedMeans[index++];
347  }
348  membershipFunctions[i]->SetOrigin(origin);
349  classifier->AddMembershipFunction(membershipFunctions[i].GetPointer());
350  }
351 
352  classifier->Update();
353  // Software Guide : EndCodeSnippet
354 
355  // Software Guide : BeginLatex
356  //
357  // The following code snippet prints out the measurement vectors and
358  // their class labels in the \code{sample}.
359  //
360  // Software Guide : EndLatex
361 
362  // Software Guide : BeginCodeSnippet
363  ClassifierType::OutputType* membershipSample =
364  classifier->GetOutput();
365  ClassifierType::OutputType::ConstIterator iter = membershipSample->Begin();
366 
367  while (iter != membershipSample->End())
368  {
369  std::cout << "measurement vector = " << iter.GetMeasurementVector()
370  << "class label = " << iter.GetClassLabel()
371  << std::endl;
372  ++iter;
373  }
374  // Software Guide : EndCodeSnippet
375 
376  return EXIT_SUCCESS;
377 }

Generated at Sun Feb 3 2013 00:15:36 for Orfeo Toolbox with doxygen 1.8.1.1