CRAAM 2.0.0
Robust and Approximate Markov Decision Processes
ImMDP.hpp
1 #pragma once
2 
3 #include "RMDP.hpp"
4 #include "Transition.hpp"
5 #include "modeltools.hpp"
6 #include "algorithms/values.hpp"
7 #include "algorithms/robust_values.hpp"
8 #include "algorithms/occupancies.hpp"
9 
10 #include <vector>
11 #include <memory>
12 #include <random>
13 #include <algorithm>
14 #include <iostream>
15 #include <iterator>
16 
17 #include "cpp11-range-master/range.hpp"
18 
19 namespace craam{
21 namespace impl{
22 
23 using namespace std;
24 using namespace util::lang;
25 using namespace craam::algorithms;
26 
27 template<typename T>
28 T max_value(vector<T> x){
29  return (x.size() > 0) ? *max_element(x.begin(), x.end()) : -1;
30 }
31 
37 class MDPI{
38 
39 public:
54  MDPI(const shared_ptr<const MDP>& mdp, const indvec& state2observ, const Transition& initial):
55  mdp(mdp), state2observ(state2observ), initial(initial),
56  obscount(1+max_value(state2observ)),
57  action_counts(obscount, -1){
58  check_parameters(*mdp, state2observ, initial);
59 
60  for(auto state : range((size_t) 0, mdp->state_count())){
61  auto obs = state2observ[state];
62 
63  // check the number of actions
64  auto ac = mdp->get_state(state).action_count();
65  if(action_counts[obs] >= 0){
66  if(action_counts[obs] != (long) ac){
67  throw invalid_argument("Inconsistent number of actions: " + to_string(ac) +
68  " instead of " + to_string(action_counts[obs]) +
69  " in state " + to_string(state));
70  }
71  }else{
72  action_counts[obs] = ac;
73  }
74  }
75  }
76 
88  MDPI(const MDP& mdp, const indvec& state2observ, const Transition& initial)
89  : MDPI(make_shared<const MDP>(mdp), state2observ, initial){}
90 
91  size_t obs_count() const { return obscount; };
92  size_t state_count() const {return mdp->state_count(); };
93  long state2obs(long state){return state2observ[state];};
94  size_t action_count(long obsid) {return action_counts[obsid];};
95 
102  indvec obspol2statepol(const indvec& obspol) const{
103  indvec statepol(state_count());
104  obspol2statepol(obspol, statepol);
105  return statepol;
106  }
107 
114  void obspol2statepol(const indvec& obspol, indvec& statepol) const{
115  assert(obspol.size() == (size_t) obscount);
116  assert(mdp->state_count() == statepol.size());
117 
118  for(auto s : range((size_t)0, state_count())){
119  statepol[s] = obspol[state2observ[s]];
120  }
121  }
122 
128  Transition transition2obs(const Transition& tran){
129  if((size_t) tran.max_index() >= state_count())
130  throw invalid_argument("Transition to a non-existing state.");
131  Transition result;
132  for(auto i : range((size_t)0, tran.size())){
133  const long state = tran.get_indices()[i];
134  const prec_t prob = tran.get_probabilities()[i];
135  const prec_t reward = tran.get_rewards()[i];
136 
137  result.add_sample(state2obs(state), prob, reward);
138  }
139  return result;
140  }
141 
143  shared_ptr<const MDP> get_mdp() {return mdp;};
144 
146  Transition get_initial() const {return initial;};
147 
149  indvec random_policy(random_device::result_type seed = random_device{}()){
150  indvec policy(obscount, -1);
151 
152  default_random_engine gen(seed);
153 
154  for(auto obs : range(0l, obscount)){
155  auto ac = action_counts[obs];
156  if(ac == 0)
157  continue;
158 
159  uniform_int_distribution<int> dist(0,ac-1);
160  policy[obs] = dist(gen);
161  }
162 
163  return policy;
164  }
165 
172  prec_t total_return(prec_t discount, prec_t precision=SOLPREC) const{
173  auto&& sol = mpi_jac(*mdp, discount, numvec(0), PolicyDeterministic(), MAXITER, precision);
174  return sol.total_return(initial);
175  }
176 
177  // save and load description.
186  void to_csv(ostream& output_mdp, ostream& output_state2obs, ostream& output_initial,
187  bool headers = true) const{
188  // save the MDP
189  mdp->to_csv(output_mdp, headers);
190  // save state maps
191  if(headers){
192  output_state2obs << "idstate,idobs" << endl;
193  }
194  for(auto i : indices(state2observ)){
195  output_state2obs << i << "," << state2observ[i] << endl;
196  }
197 
198  // save the initial distribution
199  if(headers){
200  output_initial << "idstate,probability" << endl;
201  }
202  const indvec& inindices = initial.get_indices();
203  const numvec& probabilities = initial.get_probabilities();
204 
205  for(auto i : indices(inindices)){
206  output_initial << inindices[i] << "," << probabilities[i] << endl;
207  }
208 
209  }
210 
219  void to_csv_file(const string& output_mdp, const string& output_state2obs,
220  const string& output_initial, bool headers = true) const{
221 
222  // open file streams
223  ofstream ofs_mdp(output_mdp),
224  ofs_state2obs(output_state2obs),
225  ofs_initial(output_initial);
226 
227  // save the data
228  to_csv(ofs_mdp, ofs_state2obs, ofs_initial, headers);
229 
230  // close streams
231  ofs_mdp.close(); ofs_state2obs.close(); ofs_initial.close();
232  }
233 
244  template<typename T = MDPI>
245  static unique_ptr<T> from_csv(istream& input_mdp, istream& input_state2obs,
246  istream& input_initial, bool headers = true){
247  // read mdp
248  MDP mdp;
249  craam::from_csv(mdp,input_mdp);
250 
251  // read state2obs
252  string line;
253  if(headers) input_state2obs >> line; // skip the header
254 
255  indvec state2obs(mdp.state_count());
256  input_state2obs >> line;
257  while(input_state2obs.good()){
258  string cellstring;
259  stringstream linestream(line);
260 
261  getline(linestream, cellstring, ',');
262  auto idstate = stoi(cellstring);
263  getline(linestream, cellstring, ',');
264  auto idobs = stoi(cellstring);
265  state2obs[idstate] = idobs;
266 
267  input_state2obs >> line;
268  }
269 
270  // read initial distribution
271  if(headers) input_initial >> line; // skip the header
272 
273  Transition initial;
274  input_initial >> line;
275  while(input_initial.good()){
276  string cellstring;
277  stringstream linestream(line);
278 
279  getline(linestream, cellstring, ',');
280  auto idstate = stoi(cellstring);
281  getline(linestream, cellstring, ',');
282  auto prob = stod(cellstring); // parse as double to match prec_t
283  initial.add_sample(idstate, prob, 0.0);
284 
285  input_initial >> line;
286  }
287 
288  shared_ptr<const MDP> csmdp = make_shared<const MDP>(std::move(mdp));
289  return make_unique<T>(csmdp, state2obs, initial);
290 
291  }
292  template<typename T = MDPI>
293  static unique_ptr<T> from_csv_file(const string& input_mdp,
294  const string& input_state2obs,
295  const string& input_initial,
296  bool headers = true){
297  // open files
298  ifstream ifs_mdp(input_mdp),
299  ifs_state2obs(input_state2obs),
300  ifs_initial(input_initial);
301 
302  // delegate to the stream-based overload
303  return from_csv<T>(ifs_mdp, ifs_state2obs, ifs_initial, headers);
304  }
305 protected:
306 
307  /// the underlying MDP
308  shared_ptr<const MDP> mdp;
309  /// maps the index of a state to the index of the observation
310  indvec state2observ;
311  /// initial distribution
312  Transition initial;
313  /// number of observations
314  long obscount;
315  /// number of actions for each observation
316  indvec action_counts;
317 
322  static void check_parameters(const MDP& mdp, const indvec& state2observ, const Transition& initial){
323  // *** check consistency of provided parameters ***
324  // check that the number of state2observ coefficients is correct
325  if(mdp.state_count() != state2observ.size())
326  throw invalid_argument("Number of observation indexes must match the number of states.");
327  // check that the observation indexes are not negative
328  if(state2observ.size() == 0)
329  throw invalid_argument("Cannot have empty observations.");
330  if(*min_element(state2observ.begin(), state2observ.end()) < 0)
331  throw invalid_argument("Observation indexes must be non-negative");
332  // check the initial transition
333  if(initial.max_index() >= (long) mdp.state_count())
334  throw invalid_argument("An initial transition to a non-existent state.");
335  if(!initial.is_normalized())
336  throw invalid_argument("The initial transition must be normalized.");
337  }
338 };
339 
340 
345 class MDPI_R : public MDPI{
346 
347 public:
348 
353  MDPI_R(const shared_ptr<const MDP>& mdp, const indvec& state2observ, const Transition& initial)
354  : MDPI(mdp, state2observ, initial), robust_mdp(), state2outcome(mdp->state_count(),-1){
355  initialize_robustmdp();
356  }
357 
362  MDPI_R(const MDP& mdp, const indvec& state2observ, const Transition& initial)
363  : MDPI(mdp, state2observ, initial), robust_mdp(), state2outcome(mdp.state_count(),-1){
364  initialize_robustmdp();
365  }
366 
367  const RMDP& get_robust_mdp() const {
369  return robust_mdp;
370  };
371 
378  void update_importance_weights(const numvec& weights){
379  if(weights.size() != state_count()){
380  throw invalid_argument("Size of distribution must match the number of states.");
381  }
382 
383  // loop over all mdp states and set weights
384  for(size_t i : indices(weights)){
385  const auto rmdp_stateid = state2observ[i];
386  const auto rmdp_outcomeid = state2outcome[i];
387 
388  // loop over all actions
389  auto& rstate = robust_mdp.get_state(rmdp_stateid);
390  for(size_t ai : indices(rstate)){
391  rstate.get_action(ai).set_distribution(rmdp_outcomeid, weights[i]);
392  }
393  }
394 
395  // now normalize the weights so they sum to one
396  for(size_t si : indices(robust_mdp)){
397  auto& s = robust_mdp.get_state(si);
398  for(size_t ai : indices(s)){
399  auto& a = s.get_action(ai);
400  // check if the distribution sums to 0 (not visited)
401  const numvec& dist = a.get_distribution();
402  if(accumulate(dist.begin(), dist.end(), 0.0) > 0.0){
403  a.normalize_distribution();
404  }
405  else{
406  // just set it to be uniform
407  a.uniform_distribution();
408  }
409  }
410  }
411  }
412 
428  indvec solve_reweighted(long iterations, prec_t discount, const indvec& initobspol = indvec(0)){
429  if(initobspol.size() > 0 && initobspol.size() != obs_count()){
430  throw invalid_argument("Initial policy must be defined for all observations.");
431  }
432 
433  indvec obspol(initobspol); // return observation policy
434  if(obspol.size() == 0){
435  obspol.resize(obs_count(),0);
436  }
437  indvec statepol(state_count(),0); // state policy that corresponds to the observation policy
438  obspol2statepol(obspol,statepol);
439 
440  // map the initial distribution to observations in order to evaluate the return
441  const Transition oinitial = transition2obs(initial);
442 
443  for(auto iter : range(0l, iterations)){
444  (void) iter; // to remove the warning
445 
446  // compute state distribution
447  numvec importanceweights = occfreq_mat(*mdp, initial, discount, statepol);
448  // update importance weights
449  update_importance_weights(importanceweights);
450  // compute solution of the robust MDP with the new weights
451  auto&& s = mpi_jac(robust_mdp, discount);
452 
453  // update the observation policy
454  obspol = s.policy;
455  // map the observation policy to the individual states
456  obspol2statepol(obspol, statepol);
457  }
458  return obspol;
459  }
460 
477  indvec solve_robust(long iterations, prec_t threshold, prec_t discount, const indvec& initobspol = indvec(0)){
478 
479  if(initobspol.size() > 0 && initobspol.size() != obs_count()){
480  throw invalid_argument("Initial policy must be defined for all observations.");
481  }
482 
483  indvec obspol(initobspol); // return observation policy
484  if(obspol.size() == 0){
485  obspol.resize(obs_count(),0);
486  }
487  indvec statepol(state_count(),0); // state policy that corresponds to the observation policy
488  obspol2statepol(obspol,statepol);
489 
490  for(auto iter : range(0l, iterations)){
491  (void) iter; // to remove the warning
492 
493  // compute state distribution
494  numvec&& importanceweights = occfreq_mat(*mdp, initial, discount, statepol);
495 
496  // update importance weights
497  update_importance_weights(importanceweights);
498 
499  // compute solution of the robust MDP with the new weights
500  auto&& s = mpi_jac(robust_mdp, discount, numvec(0), uniform_nature(robust_mdp, robust_l1, threshold));
501 
502  // update the observation policy
503  obspol = s.policy;
504 
505  // map the observation policy to the individual states
506  obspol2statepol(obspol, statepol);
507 
508  }
509  return obspol;
510 
511  }
512 
513  static unique_ptr<MDPI_R> from_csv(istream& input_mdp, istream& input_state2obs,
514  istream& input_initial, bool headers = true){
515 
516  return MDPI::from_csv<MDPI_R>(input_mdp,input_state2obs,input_initial, headers);
517  };
518 
520  static unique_ptr<MDPI_R> from_csv_file(const string& input_mdp,
521  const string& input_state2obs,
522  const string& input_initial,
523  bool headers = true){
524  return MDPI::from_csv_file<MDPI_R>(input_mdp,input_state2obs,input_initial, headers);
525  };
526 
527 protected:
528 
529  /// Robust representation of the MDPI
530  RMDP robust_mdp;
531  /// Maps the index of an MDP state to the index of its outcome within its observation
532  indvec state2outcome;
533 
534  void initialize_robustmdp(){
535  // Determine the number of observations (from state2observ)
536  auto obs_count = *max_element(state2observ.begin(), state2observ.end()) + 1;
537 
538  // keep track of the number of outcomes for each observation
539  indvec outcome_count(obs_count, 0);
540 
541  for(size_t state_index : indices(*mdp)){
542  auto obs = state2observ[state_index];
543 
544  // make sure to at least create a terminal state when there are no actions for it
545  robust_mdp.create_state(obs);
546 
547  // maps the transitions
548  for(auto action_index : range(0l, action_counts[obs])){
549  // get original MDP transition
550  const Transition& old_tran = mdp->get_state(state_index).get_action(action_index).get_outcome();
551  // create a new transition
552  Transition& new_tran = robust_mdp.create_state(obs).create_action(action_index).create_outcome(outcome_count[obs]);
553 
554  // copy the original transitions (they are automatically consolidated while being added)
555  for(auto k : range((size_t) 0, old_tran.size())){
556  new_tran.add_sample(state2observ[old_tran.get_indices()[k]],
557  old_tran.get_probabilities()[k],
558  old_tran.get_rewards()[k]);
559  }
560 
561  }
562  state2outcome[state_index] = outcome_count[obs]++;
563  }
564  }
565 };
566 
567 }}
numvec occfreq_mat(const GRMDP< SType > &rmdp, const Transition &init, prec_t discount, const Policies &policies)
Computes occupancy frequencies using matrix representation of transition probabilities.
Definition: occupancies.hpp:124
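For reference, the occupancy frequency is the standard discounted state-visitation measure: assuming $P_\pi$ denotes the transition matrix induced by the policy, $p_0$ the initial distribution, and $\gamma$ the discount, it satisfies $u_\pi^\top = p_0^\top (I - \gamma P_\pi)^{-1}$, which a matrix-based implementation can obtain directly by solving a linear system.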
Transition transition2obs(const Transition &tran)
Converts a transition from states to observations, adding up the probabilities of individual states that map to the same observation.
Definition: ImMDP.hpp:128
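For example, if states 0 and 1 both map to observation 0 and carry probabilities 0.3 and 0.2 in the input transition, the converted transition assigns probability 0.5 to observation 0, since add_sample consolidates samples with a repeated index.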
indvec obspol2statepol(const indvec &obspol) const
Converts a policy defined in terms of observations to a policy defined in terms of states.
Definition: ImMDP.hpp:102
void to_csv_file(const string &output_mdp, const string &output_state2obs, const string &output_initial, bool headers=true) const
Saves the MDPI to a set of 3 csv files, for transitions, observations, and the initial distribution.
Definition: ImMDP.hpp:219
static unique_ptr< T > from_csv(istream &input_mdp, istream &input_state2obs, istream &input_initial, bool headers=true)
Loads an MDPI from a set of 3 csv files, for transitions, observations, and the initial distribution.
Definition: ImMDP.hpp:245
Represents an MDP with implementability constraints.
Definition: ImMDP.hpp:37
PolicyNature< T > uniform_nature(size_t statecount, NatureResponse< T > nature, T threshold)
A helper function that simply copies a nature specification across all states.
Definition: robust_values.hpp:318
Transition get_initial() const
Initial distribution of MDP.
Definition: ImMDP.hpp:146
long obscount
number of observations
Definition: ImMDP.hpp:314
A general robust Markov decision process.
Definition: RMDP.hpp:182
const indvec & get_indices() const
Indices with positive probabilities.
Definition: Transition.hpp:323
Transition initial
initial distribution
Definition: ImMDP.hpp:312
long max_index() const
Returns the maximal index involved in the transition.
Definition: Transition.hpp:262
shared_ptr< const MDP > get_mdp()
Internal MDP representation.
Definition: ImMDP.hpp:143
double prec_t
Default precision used throughout the code.
Definition: definitions.hpp:25
auto mpi_jac(const GRMDP< SType > &mdp, prec_t discount, const numvec &valuefunction=numvec(0), const ResponseType &response=PolicyDeterministic(), unsigned long iterations_pi=MAXITER, prec_t maxresidual_pi=SOLPREC, unsigned long iterations_vi=MAXITER, prec_t maxresidual_vi=SOLPREC/2, bool print_progress=false)
Modified policy iteration using Jacobi value iteration in the inner loop.
Definition: values.hpp:405
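A minimal sketch of calling it directly on a plain MDP (the CSV file name and discount are placeholders):

    craam::MDP mdp;
    craam::from_csv_file(mdp, "mdp.csv");
    auto sol = craam::algorithms::mpi_jac(mdp, 0.95);
    // sol.policy holds the computed policy; sol.total_return(initial)
    // evaluates it against an initial distribution, exactly as
    // MDPI::total_return does above.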
vector< prec_t > numvec
Default numerical vector.
Definition: definitions.hpp:28
indvec state2outcome
Maps the index of the mdp state to the index of the corresponding outcome within its observation's state in the robust MDP.
Definition: ImMDP.hpp:532
shared_ptr< const MDP > mdp
the underlying MDP
Definition: ImMDP.hpp:308
indvec action_counts
number of actions for each observation
Definition: ImMDP.hpp:316
MDPI_R(const MDP &mdp, const indvec &state2observ, const Transition &initial)
Calls the base constructor and also constructs the corresponding robust MDP.
Definition: ImMDP.hpp:362
MDPI(const MDP &mdp, const indvec &state2observ, const Transition &initial)
Constructs the MDP with implementability constraints.
Definition: ImMDP.hpp:88
vec_scal_t robust_l1(const numvec &v, const numvec &p, prec_t threshold)
L1 robust response.
Definition: robust_values.hpp:44
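The exact definition lives in robust_values.hpp, but an L1 robust response of this kind typically evaluates the worst case over an L1 ball of radius $\psi$ (the threshold) around the nominal distribution $\bar p$, i.e. a problem of the form $\min\,\{\,p^\top v : \|p - \bar p\|_1 \le \psi,\; p \ge 0,\; \mathbf{1}^\top p = 1\,\}$.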
Definition: values.hpp:252
const numvec & get_probabilities() const
Returns list of positive probabilities for indexes returned by get_indices.
Definition: Transition.hpp:332
size_t state_count() const
Number of states.
Definition: RMDP.hpp:225
void to_csv(ostream &output_mdp, ostream &output_state2obs, ostream &output_initial, bool headers=true) const
Saves the MDPI to a set of 3 csv files, for transitions, observations, and the initial distribution.
Definition: ImMDP.hpp:186
void initialize_robustmdp()
Constructs a robust version of the implementable MDP.
Definition: ImMDP.hpp:534
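In this construction (lines 541-563 of the listing), each observation becomes one state of the robust MDP, each of its actions keeps the shared action index, and every underlying MDP state mapped to the observation contributes one outcome per action; state2outcome records which outcome each state became.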
static void check_parameters(const MDP &mdp, const indvec &state2observ, const Transition &initial)
Checks whether the parameters are correct.
Definition: ImMDP.hpp:322
indvec solve_robust(long iterations, prec_t threshold, prec_t discount, const indvec &initobspol=indvec(0))
Uses a robust MDP formulation to solve the MDPI.
Definition: ImMDP.hpp:477
Main namespace for algorithms that operate on MDPs and RMDPs.
Definition: occupancies.hpp:8
MDPI_R(const shared_ptr< const MDP > &mdp, const indvec &state2observ, const Transition &initial)
Calls the base constructor and also constructs the corresponding robust MDP.
Definition: ImMDP.hpp:353
size_t size() const
Returns the number of target states with non-zero transition probabilities.
Definition: Transition.hpp:249
Represents sparse transition probabilities and rewards from a single state.
Definition: Transition.hpp:31
constexpr prec_t SOLPREC
Default solution precision.
Definition: definitions.hpp:40
Model & from_csv(Model &mdp, istream &input, bool header=true)
Loads a GRMDP definition from a simple csv file.
Definition: modeltools.hpp:82
bool is_normalized() const
Checks whether the transition probabilities sum to one.
Definition: Transition.hpp:186
SType & create_state(long stateid)
Ensures that the MDP state exists; if it does not, it is created.
Definition: RMDP.hpp:211
const numvec & get_rewards() const
Rewards for indices with positive probabilities returned by get_indices.
Definition: Transition.hpp:337
const RMDP & get_robust_mdp() const
Definition: ImMDP.hpp:367
void update_importance_weights(const numvec &weights)
Updates the weights on outcomes in the robust MDP based on the state weights provided.
Definition: ImMDP.hpp:378
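Concretely, each outcome of an observation's action receives the occupancy frequency of the MDP state it aggregates, and the per-action distributions are then normalized; distributions that sum to zero (unvisited observations) are reset to uniform (lines 383-410 of the listing).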
Model & from_csv_file(Model &mdp, const string &filename, bool header=true)
Loads the transition probabilities and rewards from a CSV file.
Definition: modeltools.hpp:127
void obspol2statepol(const indvec &obspol, indvec &statepol) const
Converts a policy defined in terms of observations to a policy defined in terms of states.
Definition: ImMDP.hpp:114
RMDP robust_mdp
Robust representation of the MDPI.
Definition: ImMDP.hpp:525
AType & create_action(long actionid)
Creates an action given by actionid if it does not exist.
Definition: State.hpp:66
indvec solve_reweighted(long iterations, prec_t discount, const indvec &initobspol=indvec(0))
Uses a simple iterative algorithm to solve the MDPI.
Definition: ImMDP.hpp:428
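Each iteration (lines 443-457 of the listing) recomputes occupancy frequencies for the current state policy with occfreq_mat, feeds them to update_importance_weights, re-solves the robust MDP with mpi_jac, and maps the resulting observation policy back to states; note that the loop runs for a fixed number of iterations rather than to a convergence criterion.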
static unique_ptr< MDPI_R > from_csv_file(const string &input_mdp, const string &input_state2obs, const string &input_initial, bool headers=true)
Loads the class from a set of CSV files.
Definition: ImMDP.hpp:520
vector< long > indvec
Default index vector.
Definition: definitions.hpp:31
An MDP with implementability constraints.
Definition: ImMDP.hpp:345
constexpr unsigned long MAXITER
Default number of iterations.
Definition: definitions.hpp:43
Main namespace which includes modeling and solving functionality.
Definition: Action.hpp:18
prec_t total_return(prec_t discount, prec_t precision=SOLPREC) const
Computes a return of an observation policy.
Definition: ImMDP.hpp:172
MDPI(const shared_ptr< const MDP > &mdp, const indvec &state2observ, const Transition &initial)
Constructs the MDP with implementability constraints.
Definition: ImMDP.hpp:54
indvec state2observ
maps index of a state to the index of the observation
Definition: ImMDP.hpp:310
void add_sample(long stateid, prec_t probability, prec_t reward)
Adds a single transition probability to the existing probabilities.
Definition: Transition.hpp:116
indvec random_policy(random_device::result_type seed=random_device{}())
Constructs a random observation policy.
Definition: ImMDP.hpp:149