CRAAM 2.0.0
Robust and Approximate Markov Decision Processes
robust_values.hpp
#pragma once

#include "../RMDP.hpp"
#include "values.hpp"
#include <functional>
#include <type_traits>
#include "../cpp11-range-master/range.hpp"

namespace craam { namespace algorithms{

using namespace std;
using namespace util::lang;

// *******************************************************
// Nature definitions
// *******************************************************

/// Function representing constraints on nature.
template<class T>
using NatureResponse = vec_scal_t (*)(numvec const& v, numvec const& p, T threshold);

/// Represents an instance of nature that can be used to directly compute
/// the response: a nature response function paired with its threshold.
template<class T>
using NatureInstance = pair<NatureResponse<T>, T>;

/// L1 robust response.
inline vec_scal_t robust_l1(const numvec& v, const numvec& p, prec_t threshold){
    assert(v.size() == p.size());
    return worstcase_l1(v,p,threshold);
}

/// L1 optimistic response.
inline vec_scal_t optimistic_l1(const numvec& v, const numvec& p, prec_t threshold){
    assert(v.size() == p.size());
    // TODO: this could be faster without copying the vector and just modifying the function
    numvec minusv(v.size());
    transform(begin(v), end(v), begin(minusv), negate<prec_t>());
    auto&& result = worstcase_l1(minusv,p,threshold);
    return make_pair(result.first, -result.second);
}

/// Worst outcome; the threshold is ignored.
template<class T>
inline vec_scal_t robust_unbounded(const numvec& v, const numvec& p, T){
    assert(v.size() == p.size());
    numvec dist(v.size(),0.0);
    long index = min_element(begin(v), end(v)) - begin(v);
    dist[index] = 1;
    return make_pair(dist,v[index]);
}

/// Best outcome; the threshold is ignored.
template<class T>
inline vec_scal_t optimistic_unbounded(const numvec& v, const numvec& p, T){
    assert(v.size() == p.size());
    numvec dist(v.size(),0.0);
    long index = max_element(begin(v), end(v)) - begin(v);
    dist[index] = 1;
    return make_pair(dist,v[index]);
}
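
// Example (illustrative, not part of the original header): a NatureInstance
// pairs a response function with its threshold; calling it returns nature's
// distribution together with the corresponding value.
//
//     NatureInstance<prec_t> nat = make_pair(robust_l1, 0.5);
//     numvec values = {1.0, 2.0, 3.0};    // values of the outcomes
//     numvec nominal = {0.3, 0.3, 0.4};   // nominal distribution
//     vec_scal_t response = nat.first(values, nominal, nat.second);
//     // response.first: worst-case distribution in the L1 ball around nominal
//     // response.second: expected value under that distribution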

// *******************************************************
// RegularAction computation methods
// *******************************************************

/// Computes the value of the action under the given response of nature.
template<class T>
inline vec_scal_t value_action(const RegularAction& action, const numvec& valuefunction,
                prec_t discount, const NatureInstance<T>& nature){

    const numvec& rewards = action.get_outcome().get_rewards();
    const indvec& nonzero_indices = action.get_outcome().get_indices();

    numvec qvalues(rewards.size()); // values for individual states - used by nature

    #pragma omp simd
    for(size_t i = 0; i < rewards.size(); i++){
        qvalues[i] = rewards[i] + discount * valuefunction[nonzero_indices[i]];
    }

    return nature.first(qvalues, action.get_outcome().get_probabilities(), nature.second);
}

// *******************************************************
// WeightedOutcomeAction computation methods
// *******************************************************

/// Computes the value of the action, with nature's response applied to the
/// distribution over outcomes.
template<class T>
inline vec_scal_t value_action(const WeightedOutcomeAction& action, numvec const& valuefunction,
                prec_t discount, const NatureInstance<T>& nature) {

    assert(action.get_distribution().size() == action.get_outcomes().size());

    if(action.get_outcomes().empty())
        throw invalid_argument("Action with no outcomes.");

    numvec outcomevalues(action.size());
    for(size_t i = 0; i < action.size(); i++)
        outcomevalues[i] = action[i].value(valuefunction, discount);

    return nature.first(outcomevalues, action.get_distribution(), nature.second);
}

// *******************************************************
// State computation methods
// *******************************************************

/// Computes the value of a fixed action and any response of nature.
template<class AType, class T>
inline vec_scal_t
value_fix_state(const SAState<AType>& state, numvec const& valuefunction, prec_t discount,
                long actionid, const NatureInstance<T>& nature) {
    // this is the terminal state, return 0
    if(state.is_terminal()) return make_pair(numvec(0),0);

    assert(actionid >= 0 && actionid < long(state.size()));
    if(actionid < 0 || actionid >= (long) state.size())
        throw range_error("invalid actionid: " + to_string(actionid) +
                          " for action count: " + to_string(state.get_actions().size()));

    const auto& action = state[actionid];
    // cannot assume that the action is valid
    if(!state.is_valid(actionid)) throw invalid_argument("Cannot take an invalid action");

    return value_action(action, valuefunction, discount, nature);
}


/// Finds the greedy action and its value for the given value function.
/// Returns (action index, nature's distribution, value).
template<typename AType, typename T>
inline ind_vec_scal_t
value_max_state(const SAState<AType>& state, const numvec& valuefunction,
                prec_t discount, const NatureInstance<T>& nature) {

    if(state.is_terminal())
        return make_tuple(-1,numvec(),0);

    prec_t maxvalue = -numeric_limits<prec_t>::infinity();

    long result = -1;
    numvec result_outcome;

    for(size_t i = 0; i < state.get_actions().size(); i++){
        const auto& action = state[i];

        // skip invalid actions
        if(!state.is_valid(i)) continue;

        auto value = value_action(action, valuefunction, discount, nature);
        if(value.second > maxvalue){
            maxvalue = value.second;
            result = i;
            result_outcome = move(value.first);
        }
    }

    // if the result has not been changed, then all actions are invalid
    if(result == -1)
        throw invalid_argument("all actions are invalid.");

    return make_tuple(result,result_outcome,maxvalue);
}

// **************************************************************************
// Helper classes to handle computing of the best response
// **************************************************************************

/// A robust solution to a robust or regular MDP.
struct SolutionRobust : public Solution {
    /// Randomized policy of nature, with probabilities only for states that
    /// have non-zero probability in the MDP.
    vector<numvec> natpolicy;

    /// Empty SolutionRobust.
    SolutionRobust() : Solution(), natpolicy(0) {};

    /// Empty SolutionRobust for a problem with statecount states.
    SolutionRobust(size_t statecount): Solution(statecount), natpolicy(statecount, numvec(0)) {};

    /// Empty SolutionRobust for a problem with policy and value function.
    SolutionRobust(numvec valuefunction, indvec policy):
        Solution(move(valuefunction), move(policy)),
        natpolicy(this->valuefunction.size(), numvec(0)) {};

    SolutionRobust(numvec valuefunction, indvec policy,
                   vector<numvec> natpolicy, prec_t residual = -1, long iterations = -1) :
        Solution(move(valuefunction), move(policy), residual, iterations),
        natpolicy(move(natpolicy)) {};
};

/// The class abstracts some operations of value / policy iteration in order
/// to generalize to various types of nature.
template<class T>
class PolicyNature : public PolicyDeterministic {
public:
    /// Specification of nature's response (the function that nature computes;
    /// could be different for each state).
    vector<NatureInstance<T>> natspec;

    /// Constructs the object from a policy and a specification of nature.
    PolicyNature(indvec policy, vector<NatureInstance<T>> natspec):
        PolicyDeterministic(move(policy)), natspec(move(natspec)) {};

    /// Constructs the object from a specification of nature; the policy is
    /// left empty and is therefore optimized.
    PolicyNature(vector<NatureInstance<T>> natspec):
        PolicyDeterministic(indvec(0)), natspec(move(natspec)) {};

    /// Constructs a new robust solution.
    SolutionRobust new_solution(size_t statecount, numvec valuefunction) const {
        if(natspec.size() != statecount)
            throw invalid_argument("Size of nature specification does not match the number of states.");

        process_valuefunction(statecount, valuefunction);
        SolutionRobust solution = SolutionRobust(move(valuefunction), process_policy(statecount));
        return solution;
    }

    /// Computes the Bellman update and updates the solution to the best
    /// response. It does not update the value function.
    template<class SType>
    prec_t update_solution(SolutionRobust& solution, const SType& state, long stateid,
            const numvec& valuefunction, prec_t discount) const{

        prec_t newvalue = 0;
        // check whether this state should only be evaluated or also optimized
        if(policy.empty() || policy[stateid] < 0){ // optimizing
            tie(solution.policy[stateid], solution.natpolicy[stateid], newvalue) =
                value_max_state(state, valuefunction, discount, natspec[stateid]);
        }else{ // fixed-action, do not copy
            tie(solution.natpolicy[stateid], newvalue) =
                value_fix_state(state, valuefunction, discount, policy[stateid], natspec[stateid]);
        }
        return newvalue;
    }

    /// Computes a fixed Bellman update using the current solution policy
    /// and the stored response of nature.
    template<class SType>
    prec_t update_value(const SolutionRobust& solution, const SType& state, long stateid,
            const numvec& valuefunction, prec_t discount) const{

        return value_fix_state(state, valuefunction, discount, solution.policy[stateid],
                               solution.natpolicy[stateid]);
    }
};

/// A helper function that simply copies a nature specification across
/// all states.
template<class T>
inline PolicyNature<T> uniform_nature(size_t statecount, NatureResponse<T> nature,
        T threshold){
    return PolicyNature<T>(vector<NatureInstance<T>>(statecount, make_pair(nature, threshold)));
}

/// A helper function that simply copies a nature specification across
/// all states of the model.
template<class Model, class T>
inline PolicyNature<T> uniform_nature(const Model& m, NatureResponse<T> nature,
        T threshold){
    return PolicyNature<T>(vector<NatureInstance<T>>(m.state_count(), make_pair(nature, threshold)));
}
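
// Example (illustrative sketch): use the same L1-robust response with budget
// 0.5 in every state; `mdp` stands for any model exposing state_count().
//
//     auto spec = uniform_nature(mdp, robust_l1, 0.5);
//     // equivalent: uniform_nature(mdp.state_count(), robust_l1, 0.5)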

// **************************************************************************
// Convenient interface methods
// **************************************************************************

namespace internal{

/// Zips two vectors into a vector of pairs.
template <class T1, class T2>
vector<pair<T1,T2>> zip(const vector<T1>& v1, const vector<T2>& v2){
    assert(v1.size() == v2.size());
    vector<pair<T1,T2>> result(v1.size());
    for(size_t i=0; i < v1.size(); i++){
        result[i] = make_pair(v1[i], v2[i]);
    }
    return result;
}

/// Zips a single value with every element of a vector.
template <class T1, class T2>
vector<pair<T1,T2>> zip(const T1& v1, const vector<T2>& v2){
    vector<pair<T1,T2>> result(v2.size());
    for(size_t i=0; i < v2.size(); i++){
        result[i] = make_pair(v1, v2[i]);
    }
    return result;
}
}

/// Gauss-Seidel variant of value iteration (not parallelized) for a robust
/// MDP with a state-dependent nature specification.
template<class SType, class T = prec_t>
inline auto rsolve_vi(const GRMDP<SType>& mdp, prec_t discount,
                const vector<NatureResponse<T>>& nature, const vector<T>& thresholds,
                numvec valuefunction=numvec(0), const indvec& policy = indvec(0),
                unsigned long iterations=MAXITER, prec_t maxresidual=SOLPREC)
{
    assert(nature.size() == thresholds.size());
    assert(nature.size() == mdp.state_count());

    return vi_gs<SType, PolicyNature<T>>(mdp, discount, move(valuefunction),
                PolicyNature<T>(policy, internal::zip(nature,thresholds)),
                iterations, maxresidual);
}

/// Overload that applies the same nature response in every state, with
/// possibly state-dependent thresholds.
template<class SType, class T = prec_t>
inline auto rsolve_vi(const GRMDP<SType>& mdp, prec_t discount,
                const NatureResponse<T>& nature, const vector<T>& thresholds,
                numvec valuefunction=numvec(0), const indvec& policy = indvec(0),
                unsigned long iterations=MAXITER, prec_t maxresidual=SOLPREC)
{
    assert(thresholds.size() == mdp.state_count());

    return vi_gs<SType, PolicyNature<T>>(mdp, discount, move(valuefunction),
                PolicyNature<T>(policy, internal::zip(nature,thresholds)),
                iterations, maxresidual);
}
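
// Example (illustrative sketch; assumes a populated model `mdp` that is an
// instantiation of GRMDP):
//
//     prec_t discount = 0.95;
//     vector<prec_t> budgets(mdp.state_count(), 0.5);
//     auto sol = rsolve_vi(mdp, discount, robust_l1, budgets);
//     // sol.valuefunction, sol.policy, sol.natpolicy: robust value function,
//     // the agent's policy, and nature's response, respectively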

/// Modified policy iteration using Jacobi value iteration in the inner loop,
/// with a state-dependent nature specification.
template<class SType, class T = prec_t>
inline auto rsolve_mpi(const GRMDP<SType>& mdp, prec_t discount,
                const vector<NatureResponse<T>>& nature, const vector<T>& thresholds,
                const numvec& valuefunction=numvec(0), const indvec& policy = indvec(0),
                unsigned long iterations_pi=MAXITER, prec_t maxresidual_pi=SOLPREC,
                unsigned long iterations_vi=MAXITER, prec_t maxresidual_vi=SOLPREC/2,
                bool print_progress=false) {
    assert(nature.size() == thresholds.size());
    assert(nature.size() == mdp.state_count());

    return mpi_jac<SType, PolicyNature<T>>(mdp, discount, valuefunction,
                PolicyNature<T>(policy, internal::zip(nature,thresholds)),
                iterations_pi, maxresidual_pi,
                iterations_vi, maxresidual_vi,
                print_progress);
}

/// Overload that applies the same nature response in every state, with
/// possibly state-dependent thresholds.
template<class SType, class T = prec_t>
inline auto rsolve_mpi(const GRMDP<SType>& mdp, prec_t discount,
                const NatureResponse<T>& nature, const vector<T>& thresholds,
                const numvec& valuefunction=numvec(0), const indvec& policy = indvec(0),
                unsigned long iterations_pi=MAXITER, prec_t maxresidual_pi=SOLPREC,
                unsigned long iterations_vi=MAXITER, prec_t maxresidual_vi=SOLPREC/2,
                bool print_progress=false) {
    assert(thresholds.size() == mdp.state_count());

    return mpi_jac<SType, PolicyNature<T>>(mdp, discount, valuefunction,
                PolicyNature<T>(policy, internal::zip(nature,thresholds)),
                iterations_pi, maxresidual_pi,
                iterations_vi, maxresidual_vi,
                print_progress);
}
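
// Example (illustrative sketch, same assumptions as the rsolve_vi example):
//
//     auto sol = rsolve_mpi(mdp, 0.95, robust_l1,
//                           vector<prec_t>(mdp.state_count(), 0.5));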

/// Converts a string representation of nature response to the appropriate
/// nature response call. Accepted values: "robust_unbounded",
/// "optimistic_unbounded", "robust_l1", "optimistic_l1".
inline NatureResponse<prec_t> string_to_nature(string nature){
    if(nature == "robust_unbounded") return robust_unbounded;
    if(nature == "optimistic_unbounded") return optimistic_unbounded;
    if(nature == "robust_l1") return robust_l1;
    if(nature == "optimistic_l1") return optimistic_l1;
    throw invalid_argument("Unknown nature.");
}
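
// Example (illustrative): resolving the response by name, e.g. when the
// ambiguity type comes from a configuration file:
//
//     NatureResponse<prec_t> resp = string_to_nature("robust_l1");
//     auto sol = rsolve_vi(mdp, 0.95, resp,
//                          vector<prec_t>(mdp.state_count(), 0.5));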

}}