Thunderbots Project
gradient_descent_optimizer.hpp
#pragma once

#include <algorithm>
#include <array>
#include <cmath>
#include <functional>

template <size_t NUM_PARAMS>
class GradientDescentOptimizer
{
   public:
    using ParamArray = std::array<double, NUM_PARAMS>;

    // Almost always good values for the decay rates, taken from:
    // https://www.ruder.io/optimizing-gradient-descent/#adam
    static constexpr double DEFAULT_PAST_GRADIENT_DECAY_RATE = 0.9;
    static constexpr double DEFAULT_PAST_SQUARED_GRADIENT_DECAY_RATE = 0.999;

    // Default step size for approximating the gradient of functions
    static constexpr double DEFAULT_GRADIENT_APPROX_STEP_SIZE = 0.00001;

    GradientDescentOptimizer(
        ParamArray param_weights = GradientDescentOptimizer<NUM_PARAMS>::ParamArray{1},
        double gradient_approx_step_size = DEFAULT_GRADIENT_APPROX_STEP_SIZE,
        double past_gradient_decay_rate = DEFAULT_PAST_GRADIENT_DECAY_RATE,
        double past_squared_gradient_decay_rate =
            DEFAULT_PAST_SQUARED_GRADIENT_DECAY_RATE);

    ParamArray maximize(std::function<double(ParamArray)> objective_function,
                        ParamArray initial_value, unsigned int num_iters);

    ParamArray minimize(std::function<double(ParamArray)> objective_function,
                        ParamArray initial_value, unsigned int num_iters);

   private:
    ParamArray followGradient(
        std::function<double(ParamArray)> objective_function, ParamArray initial_value,
        unsigned int num_iters,
        std::function<double(double, double)> gradient_movement_func);

    ParamArray approximateGradient(ParamArray params,
                                   std::function<double(ParamArray)> objective_function);

    // This constant is used to prevent division by 0 in our implementation of Adam
    // (gradient descent)
    static constexpr double eps = 1e-8;

    // Weights used to normalize parameters. See class javadoc comment above for
    // details
    ParamArray param_weights;

    // Decay rates used for Adam (see class description for details)
    double past_gradient_decay_rate;
    double past_squared_gradient_decay_rate;

    // The size of step to take when numerically approximating the derivative of
    // an objective function
    double gradient_approx_step_size;
};

template <size_t NUM_PARAMS>
GradientDescentOptimizer<NUM_PARAMS>::GradientDescentOptimizer(
    std::array<double, NUM_PARAMS> param_weights, double gradient_approx_step_size,
    double past_gradient_decay_rate, double past_squared_gradient_decay_rate)
    : param_weights(param_weights),
      past_gradient_decay_rate(past_gradient_decay_rate),
      past_squared_gradient_decay_rate(past_squared_gradient_decay_rate),
      gradient_approx_step_size(gradient_approx_step_size)
{
}

template <size_t NUM_PARAMS>
std::array<double, NUM_PARAMS> GradientDescentOptimizer<NUM_PARAMS>::maximize(
    std::function<double(std::array<double, NUM_PARAMS>)> objective_function,
    std::array<double, NUM_PARAMS> initial_value, unsigned int num_iters)
{
    return followGradient(
        objective_function, initial_value, num_iters,
        [](double curr_value, double step) { return curr_value + step; });
}

template <size_t NUM_PARAMS>
std::array<double, NUM_PARAMS> GradientDescentOptimizer<NUM_PARAMS>::minimize(
    std::function<double(std::array<double, NUM_PARAMS>)> objective_function,
    std::array<double, NUM_PARAMS> initial_value, unsigned int num_iters)
{
    return followGradient(
        objective_function, initial_value, num_iters,
        [](double curr_value, double step) { return curr_value - step; });
}

template <size_t NUM_PARAMS>
std::array<double, NUM_PARAMS> GradientDescentOptimizer<NUM_PARAMS>::followGradient(
    std::function<double(std::array<double, NUM_PARAMS>)> objective_function,
    std::array<double, NUM_PARAMS> initial_value, unsigned int num_iters,
    std::function<double(double, double)> gradient_movement_func)
{
    // Implementation of the "Adam" algorithm. See Javadoc class comment for this
    // class (in the header) for details

    // This is basically just to change the name so the below code reads more nicely
    ParamArray params = initial_value;

    // The past gradient and squared gradient averages for each parameter
    ParamArray past_gradient_averages = {0};
    ParamArray past_squared_gradient_averages = {0};

    for (unsigned iter = 0; iter < num_iters; iter++)
    {
        ParamArray gradient = approximateGradient(params, objective_function);

        // Get the squared gradient
        ParamArray squared_gradient = {0};
        for (unsigned int i = 0; i < NUM_PARAMS; i++)
        {
            squared_gradient.at(i) = std::pow(gradient.at(i), 2);
        }

        // Update past gradient and gradient squared averages
        for (unsigned int i = 0; i < NUM_PARAMS; i++)
        {
            past_gradient_averages.at(i) =
                past_gradient_decay_rate * past_gradient_averages.at(i) +
                (1 - past_gradient_decay_rate) * gradient.at(i);
            past_squared_gradient_averages.at(i) =
                past_squared_gradient_decay_rate * past_squared_gradient_averages.at(i) +
                (1 - past_squared_gradient_decay_rate) * squared_gradient.at(i);
        }

        // Create the bias corrected gradient and gradient square averages
        ParamArray bias_corrected_past_gradient_averages = {0};
        ParamArray bias_corrected_past_squared_gradient_averages = {0};
        for (unsigned int i = 0; i < NUM_PARAMS; i++)
        {
            bias_corrected_past_gradient_averages.at(i) =
                past_gradient_averages.at(i) /
                (1 - std::pow(past_gradient_decay_rate, 2));
            bias_corrected_past_squared_gradient_averages.at(i) =
                past_squared_gradient_averages.at(i) /
                (1 - std::pow(past_squared_gradient_decay_rate, 2));
        }

        // Step each param in the direction of the gradient using the operator
        // given to this function
        for (unsigned int i = 0; i < NUM_PARAMS; i++)
        {
            params.at(i) = gradient_movement_func(
                params.at(i),
                param_weights.at(i) * bias_corrected_past_gradient_averages.at(i) /
                    (std::sqrt(bias_corrected_past_squared_gradient_averages.at(i)) +
                     eps));
        }
    }

    return params;
}
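
// For reference, the per-parameter update performed in each iteration of the loop
// above is (with g_i the approximated gradient component, b1/b2 the past gradient
// and past squared gradient decay rates, w_i the parameter weight, and the bias
// correction written exactly as implemented here, with a fixed exponent of 2):
//
//   m_i     <- b1 * m_i + (1 - b1) * g_i
//   v_i     <- b2 * v_i + (1 - b2) * g_i^2
//   m_hat_i  = m_i / (1 - b1^2)
//   v_hat_i  = v_i / (1 - b2^2)
//   p_i     <- gradient_movement_func(p_i, w_i * m_hat_i / (sqrt(v_hat_i) + eps))
//
// where gradient_movement_func adds the step for maximize() and subtracts it for
// minimize().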

template <size_t NUM_PARAMS>
std::array<double, NUM_PARAMS> GradientDescentOptimizer<NUM_PARAMS>::approximateGradient(
    std::array<double, NUM_PARAMS> params,
    std::function<double(std::array<double, NUM_PARAMS>)> objective_function)
{
    ParamArray gradient = {0};
    double curr_function_value = objective_function(params);

    for (unsigned i = 0; i < NUM_PARAMS; i++)
    {
        auto test_params = params;
        test_params.at(i) += gradient_approx_step_size * param_weights.at(i);
        double new_function_value = objective_function(test_params);
        gradient.at(i) =
            (new_function_value - curr_function_value) / gradient_approx_step_size;
    }

    return gradient;
}
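
A minimal usage sketch (not part of the header above), assuming the header is available on the include path as "gradient_descent_optimizer.hpp". The two-parameter quadratic objective, the explicit {1, 1} parameter weights, and the iteration count are hypothetical values chosen only for illustration.

#include <array>
#include <iostream>

#include "gradient_descent_optimizer.hpp"  // assumed include path

int main()
{
    // Two-parameter optimizer; weights given explicitly so both parameters are
    // scaled equally, other settings left at their defaults
    GradientDescentOptimizer<2> optimizer({1, 1});

    // Hypothetical objective with its minimum at x = 3, y = -1
    auto objective = [](std::array<double, 2> params) {
        double x = params.at(0);
        double y = params.at(1);
        return (x - 3) * (x - 3) + (y + 1) * (y + 1);
    };

    // Run 1000 Adam-style iterations starting from the origin
    std::array<double, 2> result = optimizer.minimize(objective, {0, 0}, 1000);

    std::cout << "x = " << result.at(0) << ", y = " << result.at(1) << std::endl;
    return 0;
}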