CRAAM 2.0.0
Robust and Approximate Markov Decision Processes
values.hpp

#pragma once

#include "../RMDP.hpp"
#include "../cpp11-range-master/range.hpp"

#include <algorithm>    // max_element, max
#include <cmath>        // abs
#include <functional>
#include <iostream>     // cout, endl (progress reporting)
#include <limits>       // numeric_limits
#include <string>       // to_string
#include <tuple>        // tie
#include <type_traits>
#include <utility>      // move, pair, swap

namespace craam {
namespace algorithms {

using namespace std;
using namespace util::lang;

// *******************************************************
// RegularAction computation methods
// *******************************************************

inline prec_t value_action(const RegularAction& action, const numvec& valuefunction,
                           prec_t discount) {
    return action.get_outcome().value(valuefunction, discount);
}

inline prec_t value_action(const RegularAction& action, const numvec& valuefunction,
                           prec_t discount, numvec distribution) {
    return action.get_outcome().value(valuefunction, discount, distribution);
}

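// The two overloads above delegate to Transition::value. For a regular action a taken in
// state s, the returned quantity is the standard discounted action value (a restatement of
// the code above, not part of the original header):
//
//     q(s,a) = sum_{s'} P(s'|s,a) * ( r(s,a,s') + discount * valuefunction[s'] )
//
// The second overload additionally forwards the caller-supplied distribution to
// Transition::value.
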
// *******************************************************
// WeightedOutcomeAction computation methods
// *******************************************************

inline prec_t value_action(const WeightedOutcomeAction& action, numvec const& valuefunction,
                           prec_t discount) {
    assert(action.get_distribution().size() == action.get_outcomes().size());

    if(action.get_outcomes().empty())
        throw invalid_argument("WeightedOutcomeAction with no outcomes");

    prec_t averagevalue = 0.0;
    const numvec& distribution = action.get_distribution();
    for(size_t i = 0; i < action.get_outcomes().size(); i++)
        averagevalue += distribution[i] * action[i].value(valuefunction, discount);
    return averagevalue;
}

inline prec_t value_action(const WeightedOutcomeAction& action, numvec const& valuefunction,
                           prec_t discount, const numvec& distribution) {

    assert(distribution.size() == action.get_outcomes().size());
    if(action.get_outcomes().empty())
        throw invalid_argument("WeightedOutcomeAction with no outcomes");

    prec_t averagevalue = 0.0;
    // TODO: simd?
    for(size_t i = 0; i < action.get_outcomes().size(); i++)
        averagevalue += distribution[i] * action[i].value(valuefunction, discount);
    return averagevalue;
}

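// Both overloads above compute the same weighted average (a restatement of the loops above):
//
//     value(a) = sum_i d_i * q_i(v),
//
// where d is the distribution over outcomes (the action's baseline distribution in the first
// overload, the caller-supplied one in the second) and q_i(v) is the discounted value of
// outcome i's transition under the value function v.
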
// *******************************************************
// State computation methods
// *******************************************************

template<class AType>
inline pair<long,prec_t> value_max_state(const SAState<AType>& state, const numvec& valuefunction,
                                         prec_t discount) {
    if(state.is_terminal())
        return make_pair(-1,0.0);

    prec_t maxvalue = -numeric_limits<prec_t>::infinity();
    long result = -1l;

    for(size_t i = 0; i < state.size(); i++){
        auto const& action = state[i];

        // skip invalid actions
        if(!state.is_valid(i)) continue;

        auto value = value_action(action, valuefunction, discount);
        if(value >= maxvalue){
            maxvalue = value;
            result = i;
        }
    }

    // if the result has not been changed, that means that all actions are invalid
    if(result == -1)
        throw invalid_argument("all actions are invalid.");

    return make_pair(result, maxvalue);
}

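// value_max_state is the greedy Bellman operator restricted to a single state: it returns
//
//     pi(s) = argmax_{a valid in s} q(s,a)    and    v(s) = max_{a valid in s} q(s,a),
//
// with q(s,a) computed by value_action above. Terminal states short-circuit to (-1, 0).
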
template<class AType>
inline prec_t value_fix_state(const SAState<AType>& state, numvec const& valuefunction,
                              prec_t discount, long actionid) {
    // this is the terminal state, return 0
    if(state.is_terminal())
        return 0;
    if(actionid < 0 || actionid >= (long) state.get_actions().size())
        throw range_error("invalid actionid: " + to_string(actionid) + " for action count: " +
                          to_string(state.get_actions().size()) );

    const auto& action = state[actionid];
    // cannot assume that the action is valid
    if(!state.is_valid(actionid)) throw invalid_argument("Cannot take an invalid action");

    return value_action(action, valuefunction, discount);
}

template<class AType>
inline prec_t
value_fix_state(const SAState<AType>& state, numvec const& valuefunction, prec_t discount,
                long actionid, numvec distribution) {
    // this is the terminal state, return 0
    if(state.is_terminal()) return 0;

    assert(actionid >= 0 && actionid < long(state.size()));

    if(actionid < 0 || actionid >= long(state.size()))
        throw range_error("invalid actionid: " + to_string(actionid) + " for action count: "
                          + to_string(state.get_actions().size()) );

    const auto& action = state[actionid];
    // cannot assume that the action is valid
    if(!state.is_valid(actionid)) throw invalid_argument("Cannot take an invalid action");

    return value_action(action, valuefunction, discount, distribution);
}

// *******************************************************
// RMDP computation methods
// *******************************************************

/// A solution to a plain MDP.
struct Solution {
    /// Value function
    numvec valuefunction;
    /// Index of the action to take for each state
    indvec policy;
    /// Bellman residual of the computation
    prec_t residual;
    /// Number of iterations taken
    long iterations;

    Solution(): valuefunction(0), policy(0), residual(-1), iterations(-1) {};

    /// Empty solution for a problem with statecount states
    Solution(size_t statecount): valuefunction(statecount, 0.0), policy(statecount, -1),
        residual(-1), iterations(-1) {};

    /// Solution for a problem with a given value function and policy
    Solution(numvec valuefunction, indvec policy, prec_t residual = -1, long iterations = -1) :
        valuefunction(move(valuefunction)), policy(move(policy)), residual(residual),
        iterations(iterations) {};

    /// Computes the total return of the solution given the initial distribution
    prec_t total_return(const Transition& initial) const{
        if(initial.max_index() >= (long) valuefunction.size())
            throw invalid_argument("Too many indexes in the initial distribution.");
        return initial.value(valuefunction);
    };
};

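// Example (illustrative sketch, not part of the original header): inspecting a computed
// solution. A Transition constructor taking index/probability/reward vectors is assumed
// from Transition.hpp, and the mdp object is assumed to be built elsewhere.
//
//     Solution sol = solve_mpi(mdp, 0.99);
//     long best_action = sol.policy[0];            // greedy action in state 0
//     prec_t value0 = sol.valuefunction[0];        // value of state 0
//     Transition initial({0}, {1.0}, {0.0});       // start in state 0 with probability 1
//     prec_t ret = sol.total_return(initial);      // expected discounted return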

// **************************************************************************
// Helper classes for computing the best response
// **************************************************************************

/*
Regular solution to an MDP.

The policy field is ignored when its size is 0. Otherwise it is a partial policy: actions are
optimized only in states where policy = -1; in the remaining states the fixed action is used.
*/
class PolicyDeterministic {
public:
    using solution_type = Solution;

    /// Partial policy specification (action -1 is ignored and optimized)
    indvec policy;

    /// All actions will be optimized
    PolicyDeterministic() : policy(0) {};

    /// A partial policy that can be used to fix some actions; policy[s] = -1 means that the
    /// action in state s should be optimized
    PolicyDeterministic(indvec policy) : policy(move(policy)) {};

    Solution new_solution(size_t statecount, numvec valuefunction) const {
        process_valuefunction(statecount, valuefunction);
        assert(valuefunction.size() == statecount);
        Solution solution = Solution(move(valuefunction), process_policy(statecount));
        return solution;
    }

    /// Computes the Bellman update and updates the solution to the best response;
    /// it does not update the value function.
    template<class SType>
    prec_t update_solution(Solution& solution, const SType& state, long stateid,
                           const numvec& valuefunction, prec_t discount) const{
        assert(stateid < long(solution.policy.size()));

        prec_t newvalue;
        // check whether this state should only be evaluated
        if(policy.empty() || policy[stateid] < 0){ // optimizing
            tie(solution.policy[stateid], newvalue) = value_max_state(state, valuefunction, discount);
        }else{ // fixed action, do not copy
            return value_fix_state(state, valuefunction, discount, policy[stateid]);
        }
        return newvalue;
    }

    /// Computes a fixed Bellman update using the current solution policy.
    template<class SType>
    prec_t update_value(const Solution& solution, const SType& state, long stateid,
                        const numvec& valuefunction, prec_t discount) const{

        return value_fix_state(state, valuefunction, discount, solution.policy[stateid]);
    }

protected:
    void process_valuefunction(size_t statecount, numvec& valuefunction) const{
        // check that the value function has the correct size; if it is empty,
        // create one of the appropriate size
        if(!valuefunction.empty()){
            if(valuefunction.size() != statecount) throw invalid_argument("Incorrect dimensions of value function.");
        }else{
            valuefunction.assign(statecount, 0.0);
        }
    }
    indvec process_policy(size_t statecount) const {
        // check the dimensions of the policy
        if(!policy.empty()){
            if(policy.size() != statecount) throw invalid_argument("Incorrect dimensions of policy function.");
            return policy;
        }else{
            return indvec(statecount, -1);
        }
    }
};
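
// Example (sketch, not part of the original header): a partial policy for a four-state
// problem that fixes action 0 in state 2 and lets the solver optimize the remaining states
// (entries equal to -1 are optimized); the mdp object is assumed to be built elsewhere.
//
//     indvec partial{-1, -1, 0, -1};
//     PolicyDeterministic response(partial);
//     auto sol = vi_gs(mdp, 0.99, numvec(0), response);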

// **************************************************************************
// Main solution methods
// **************************************************************************

/// Gauss-Seidel variant of value iteration (not parallelized).
template<class SType, class ResponseType = PolicyDeterministic>
inline auto vi_gs(const GRMDP<SType>& mdp, prec_t discount,
                  numvec valuefunction=numvec(0), const ResponseType& response = PolicyDeterministic(),
                  unsigned long iterations=MAXITER, prec_t maxresidual=SOLPREC)
{
    const auto& states = mdp.get_states();
    typename ResponseType::solution_type solution =
            response.new_solution(states.size(), move(valuefunction));

    // just quit if there are no states
    if( mdp.state_count() == 0) return solution;

    // initialize values
    prec_t residual = numeric_limits<prec_t>::infinity();
    size_t i; // defined outside the loop so the number of iterations can be reported

    for(i = 0; i < iterations && residual > maxresidual; i++){
        residual = 0;

        for(size_t s = 0; s < states.size(); s++){
            prec_t newvalue = response.update_solution(solution, states[s], s, solution.valuefunction, discount);

            residual = max(residual, abs(solution.valuefunction[s] - newvalue));
            solution.valuefunction[s] = newvalue;
        }
    }
    solution.residual = residual;
    solution.iterations = i;
    return solution;
}

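// Note on the sweep above (a summary of the code, not part of the original header): the
// Gauss-Seidel variant updates the value function in place, so later states within the same
// sweep already see the updated values of earlier states; the Jacobi updates in mpi_jac below
// instead read from one vector and write to another. The loop stops once the Bellman residual
//
//     residual = max_s | v_new(s) - v_old(s) |
//
// drops to maxresidual or below, or after the given number of iterations.
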
/// Modified policy iteration using Jacobi value iteration in the inner loop.
template<class SType, class ResponseType = PolicyDeterministic>
inline auto mpi_jac(const GRMDP<SType>& mdp, prec_t discount,
                    const numvec& valuefunction=numvec(0), const ResponseType& response = PolicyDeterministic(),
                    unsigned long iterations_pi=MAXITER, prec_t maxresidual_pi=SOLPREC,
                    unsigned long iterations_vi=MAXITER, prec_t maxresidual_vi=SOLPREC/2,
                    bool print_progress=false) {

    const auto& states = mdp.get_states();
    typename ResponseType::solution_type solution =
            response.new_solution(states.size(), valuefunction);

    // just quit if there are no states
    if( mdp.state_count() == 0) return solution;

    numvec oddvalue = solution.valuefunction;   // set in even iterations (0 is even)
    numvec evenvalue = oddvalue;                // set in odd iterations

    numvec residuals(states.size());

    // residual in the policy iteration part
    prec_t residual_pi = numeric_limits<prec_t>::infinity();

    size_t i; // defined here to be able to report the number of iterations

    // use two vectors for value iteration and copy values back and forth
    numvec * sourcevalue = & oddvalue;
    numvec * targetvalue = & evenvalue;

    for(i = 0; i < iterations_pi; i++){

        if(print_progress)
            cout << "Policy iteration " << i << "/" << iterations_pi << ":" << endl;

        swap(targetvalue, sourcevalue);

        prec_t residual_vi = numeric_limits<prec_t>::infinity();

        // update policies
        #pragma omp parallel for
        for(auto s = 0l; s < long(states.size()); s++){
            prec_t newvalue = response.update_solution(solution, states[s], s, *sourcevalue, discount);
            residuals[s] = abs((*sourcevalue)[s] - newvalue);
            (*targetvalue)[s] = newvalue;
        }
        residual_pi = *max_element(residuals.cbegin(),residuals.cend());

        if(print_progress) cout << " Bellman residual: " << residual_pi << endl;

        // the residual is sufficiently small
        if(residual_pi <= maxresidual_pi)
            break;

        if(print_progress) cout << " Value iteration: " << flush;

        // compute values using value iteration
        for(size_t j = 0; j < iterations_vi && residual_vi > maxresidual_vi; j++){
            if(print_progress) cout << "." << flush;

            swap(targetvalue, sourcevalue);

            #pragma omp parallel for
            for(auto s = 0l; s < (long) states.size(); s++){
                prec_t newvalue = response.update_value(solution, states[s], s, *sourcevalue, discount);
                residuals[s] = abs((*sourcevalue)[s] - newvalue);
                (*targetvalue)[s] = newvalue;
            }
            residual_vi = *max_element(residuals.begin(),residuals.end());
        }
        if(print_progress) cout << endl << " Residual (fixed policy): " << residual_vi << endl << endl;
    }
    solution.valuefunction = move(*targetvalue);
    solution.residual = residual_pi;
    solution.iterations = i;
    return solution;
}

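// Any ResponseType accepted by vi_gs and mpi_jac is used exactly as PolicyDeterministic above;
// the calls made by the two solvers amount to the following interface (a summary of the usage
// above, not a formal concept in the original code):
//
//     using solution_type = ...;   // must expose valuefunction, residual and iterations
//     solution_type new_solution(size_t statecount, numvec valuefunction) const;
//     prec_t update_solution(solution_type& solution, const SType& state, long stateid,
//                            const numvec& valuefunction, prec_t discount) const;
//     prec_t update_value(const solution_type& solution, const SType& state, long stateid,
//                         const numvec& valuefunction, prec_t discount) const;
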
// **************************************************************************
// Convenient interface methods
// **************************************************************************

/// Gauss-Seidel variant of value iteration (not parallelized); convenience wrapper around
/// vi_gs with an optional partial policy.
template<class SType>
inline auto solve_vi(const GRMDP<SType>& mdp, prec_t discount,
                     numvec valuefunction=numvec(0), const indvec& policy = indvec(0),
                     unsigned long iterations=MAXITER, prec_t maxresidual=SOLPREC)
{
    return vi_gs<SType, PolicyDeterministic>(mdp, discount, move(valuefunction),
                PolicyDeterministic(policy), iterations, maxresidual);
}

/// Modified policy iteration using Jacobi value iteration in the inner loop; convenience
/// wrapper around mpi_jac with an optional partial policy.
template<class SType>
inline auto solve_mpi(const GRMDP<SType>& mdp, prec_t discount,
                      const numvec& valuefunction=numvec(0), const indvec& policy = indvec(0),
                      unsigned long iterations_pi=MAXITER, prec_t maxresidual_pi=SOLPREC,
                      unsigned long iterations_vi=MAXITER, prec_t maxresidual_vi=SOLPREC/2,
                      bool print_progress=false) {

    return mpi_jac<SType, PolicyDeterministic>(mdp, discount, valuefunction, PolicyDeterministic(policy),
                iterations_pi, maxresidual_pi,
                iterations_vi, maxresidual_vi,
                print_progress);
}

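// Example (illustrative sketch, not part of the original header): solving a plain MDP with
// the convenience wrappers. The MDP type (a GRMDP over regular states) is assumed to be the
// typedef from RMDP.hpp, and building the model is elided.
//
//     MDP mdp;
//     // ... populate mdp with states, actions and transitions ...
//
//     // Gauss-Seidel value iteration to the default precision SOLPREC
//     auto sol_vi = solve_vi(mdp, 0.99);
//
//     // modified policy iteration with progress reporting
//     auto sol_mpi = solve_mpi(mdp, 0.99, numvec(0), indvec(0),
//                              MAXITER, SOLPREC, MAXITER, SOLPREC/2, true);
//
//     // sol_mpi.policy[s] is the greedy action in state s, sol_mpi.valuefunction[s] its
//     // value, and sol_mpi.residual the final Bellman residual.
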
} // namespace algorithms
} // namespace craam