OTB  9.0.0
Orfeo Toolbox
otbSampleAugmentation.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2005-2022 Centre National d'Etudes Spatiales (CNES)
3  *
4  * This file is part of Orfeo Toolbox
5  *
6  * https://www.orfeo-toolbox.org/
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  * http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  */
20 
21 #ifndef otbSampleAugmentation_h
22 #define otbSampleAugmentation_h
23 
24 #ifdef _OPENMP
25 #include <omp.h>
26 #endif
27 
28 #include <vector>
29 #include <queue>
30 #include <iterator>
31 #include <algorithm>
32 #include <random>
33 #include <ctime>
34 #include <cassert>
35 
36 namespace otb
37 {
38 
39 namespace sampleAugmentation
40 {
41 using SampleType = std::vector<double>;
42 using SampleVectorType = std::vector<SampleType>;
43 
49 {
50  const auto nbSamples = samples.size();
51  const long nbComponents = static_cast<long>(samples[0].size());
52  SampleType stds(nbComponents, 0.0);
53  SampleType means(nbComponents, 0.0);
54  for (size_t i = 0; i < nbSamples; ++i)
55  {
56  auto norm_factor = 1.0 / (i + 1);
57 #ifdef _OPENMP
58 #pragma omp parallel for
59 #endif
60  for (long j = 0; j < nbComponents; ++j)
61  {
62  const auto mu = means[j];
63  const auto x = samples[i][j];
64  auto muNew = mu + (x - mu) * norm_factor;
65  stds[j] += (x - mu) * (x - muNew);
66  means[j] = muNew;
67  }
68  }
69 #ifdef _OPENMP
70 #pragma omp parallel for
71 #endif
72  for (long j = 0; j < nbComponents; ++j)
73  {
74  stds[j] = std::sqrt(stds[j] / nbSamples);
75  }
76  return stds;
77 }
79 
84 void ReplicateSamples(const SampleVectorType& inSamples, const size_t nbSamples, SampleVectorType& newSamples)
85 {
86  newSamples.resize(nbSamples);
87  const long long nbSamplesLL = static_cast<long long>(nbSamples);
88  size_t imod{0};
89 #ifdef _OPENMP
90 #pragma omp parallel for
91 #endif
92  for (long long i = 0; i < nbSamplesLL; ++i)
93  {
94  if (imod == inSamples.size())
95  imod = 0;
96  newSamples[i] = inSamples[imod++];
97  }
98 }
100 
107 void JitterSamples(const SampleVectorType& inSamples, const size_t nbSamples, SampleVectorType& newSamples, float stdFactor = 10,
108  const int seed = std::time(nullptr))
109 {
110  newSamples.resize(nbSamples);
111  const long nbComponents = static_cast<long>(inSamples[0].size());
112  std::random_device rd;
113  std::mt19937 gen(rd());
114  // The input samples are selected randomly with replacement
115  std::srand(seed);
116  // We use one gaussian distribution per component since they may
117  // have different stds
118  auto stds = EstimateStds(inSamples);
119  std::vector<std::normal_distribution<double>> gaussDis(nbComponents);
120 #ifdef _OPENMP
121 #pragma omp parallel for
122 #endif
123  for (long i = 0; i < nbComponents; ++i)
124  gaussDis[i] = std::normal_distribution<double>{0.0, stds[i] / stdFactor};
126 
127  for (size_t i = 0; i < nbSamples; ++i)
128  {
129  newSamples[i] = inSamples[std::rand() % inSamples.size()];
130 #ifdef _OPENMP
131 #pragma omp parallel for
132 #endif
133  for (long j = 0; j < nbComponents; ++j)
134  newSamples[i][j] += gaussDis[j](gen);
135  }
136 }
137 
138 
140 {
141  size_t index;
142  double distance;
143 };
144 
146 {
147  constexpr bool operator()(const NeighborType& a, const NeighborType& b) const
148  {
149  return b.distance > a.distance;
150  }
151 };
152 
153 double ComputeSquareDistance(const SampleType& x, const SampleType& y)
154 {
155  assert(x.size() == y.size());
156  double dist{0};
157  for (size_t i = 0; i < x.size(); ++i)
158  {
159  dist += (x[i] - y[i]) * (x[i] - y[i]);
160  }
161  return dist / (x.size() * x.size());
162 }
163 
164 using NNIndicesType = std::vector<NeighborType>;
165 using NNVectorType = std::vector<NNIndicesType>;
168 void FindKNNIndices(const SampleVectorType& inSamples, const size_t nbNeighbors, NNVectorType& nnVector)
169 {
170  const long long nbSamples = static_cast<long long>(inSamples.size());
171  nnVector.resize(nbSamples);
172 #ifdef _OPENMP
173 #pragma omp parallel for
174 #endif
175  for (long long sampleIdx = 0; sampleIdx < nbSamples; ++sampleIdx)
176  {
177  std::priority_queue<NeighborType, NNIndicesType, NeighborSorter> nns;
178  for (long long neighborIdx = 0; neighborIdx < nbSamples; ++neighborIdx)
179  {
180  if (sampleIdx != neighborIdx)
181  {
182  nns.push({static_cast<size_t>(neighborIdx), ComputeSquareDistance(inSamples[sampleIdx], inSamples[neighborIdx])});
183  if (nns.size() > nbNeighbors)
184  {
185  nns.pop();
186  }
187  }
188  }
190 
191  NNIndicesType nnv;
192  nnv.reserve(nns.size());
193  while (!nns.empty())
194  {
195  nnv.push_back(nns.top());
196  nns.pop();
197  }
198  std::reverse(std::begin(nnv), std::end(nnv));
199 
200  nnVector[sampleIdx] = std::move(nnv);
201  }
202 }
203 
206 SampleType SmoteCombine(const SampleType& s1, const SampleType& s2, double position)
207 {
208  auto result = s1;
209  for (size_t i = 0; i < s1.size(); ++i)
210  result[i] = s1[i] + (s2[i] - s1[i]) * position;
211  return result;
212 }
214 
221 void Smote(const SampleVectorType& inSamples, const size_t nbSamples, SampleVectorType& newSamples, const int nbNeighbors, const int seed = std::time(nullptr))
222 {
223  newSamples.resize(nbSamples);
224  const long long nbSamplesLL = static_cast<long long>(nbSamples);
225  NNVectorType nnVector;
226  FindKNNIndices(inSamples, nbNeighbors, nnVector);
227  // The input samples are selected randomly with replacement
228  std::srand(seed);
229 #ifdef _OPENMP
230 #pragma omp parallel for
231 #endif
232  for (long long i = 0; i < nbSamplesLL; ++i)
233  {
234  const auto sampleIdx = std::rand() % (inSamples.size());
235  const auto sample = inSamples[sampleIdx];
236  const auto neighborIdx = nnVector[sampleIdx][std::rand() % nbNeighbors].index;
237  const auto neighbor = inSamples[neighborIdx];
238  newSamples[i] = SmoteCombine(sample, neighbor, std::rand() / double{RAND_MAX});
239  }
240 }
242 
243 } // end namespaces sampleAugmentation
244 } // end namespace otb
245 
246 #endif
otb::sampleAugmentation::NeighborType::index
vcl_size_t index
Definition: otbSampleAugmentation.h:141
otb::sampleAugmentation::FindKNNIndices
void FindKNNIndices(const SampleVectorType &inSamples, const vcl_size_t nbNeighbors, NNVectorType &nnVector)
Definition: otbSampleAugmentation.h:168
otb::sampleAugmentation::NeighborType
Definition: otbSampleAugmentation.h:139
otb::sampleAugmentation::Smote
void Smote(const SampleVectorType &inSamples, const vcl_size_t nbSamples, SampleVectorType &newSamples, const int nbNeighbors, const int seed=std::time(nullptr))
Definition: otbSampleAugmentation.h:221
otb::sampleAugmentation::SmoteCombine
SampleType SmoteCombine(const SampleType &s1, const SampleType &s2, double position)
Definition: otbSampleAugmentation.h:206
otb
The "otb" namespace contains all Orfeo Toolbox (OTB) classes.
Definition: otbJoinContainer.h:32
otb::sampleAugmentation::ReplicateSamples
void ReplicateSamples(const SampleVectorType &inSamples, const vcl_size_t nbSamples, SampleVectorType &newSamples)
Definition: otbSampleAugmentation.h:84
otb::sampleAugmentation::NeighborType::distance
double distance
Definition: otbSampleAugmentation.h:142
otb::sampleAugmentation::NNIndicesType
std::vector< NeighborType > NNIndicesType
Definition: otbSampleAugmentation.h:164
otb::sampleAugmentation::NeighborSorter
Definition: otbSampleAugmentation.h:145
otb::sampleAugmentation::SampleType
std::vector< double > SampleType
Definition: otbSampleAugmentation.h:41
otb::sampleAugmentation::SampleVectorType
std::vector< SampleType > SampleVectorType
Definition: otbSampleAugmentation.h:42
otb::sampleAugmentation::JitterSamples
void JitterSamples(const SampleVectorType &inSamples, const vcl_size_t nbSamples, SampleVectorType &newSamples, float stdFactor=10, const int seed=std::time(nullptr))
Definition: otbSampleAugmentation.h:107
otb::sampleAugmentation::NNVectorType
std::vector< NNIndicesType > NNVectorType
Definition: otbSampleAugmentation.h:165
otb::sampleAugmentation::ComputeSquareDistance
double ComputeSquareDistance(const SampleType &x, const SampleType &y)
Definition: otbSampleAugmentation.h:153
otb::sampleAugmentation::EstimateStds
SampleType EstimateStds(const SampleVectorType &samples)
Definition: otbSampleAugmentation.h:48
otb::sampleAugmentation::NeighborSorter::operator()
constexpr bool operator()(const NeighborType &a, const NeighborType &b) const
Definition: otbSampleAugmentation.h:147