Tag: bellman equation

  • From Certain to Uncertain | Stochastic Bellman Equation Made Easy

    In the video below, we go over how to calculate the value of a state when the actions are probabilistic.
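
    Before running the full grid, here is a minimal sketch of a single stochastic Bellman backup for one state, assuming the same split used in the snippet below (a 0.5 chance of the intended move plus 0.125 for each of the four directions) and hypothetical neighbour values:

    # Minimal sketch: one stochastic Bellman backup for a single state.
    # Assumes a 0.5 chance of the intended move plus 0.125 for each direction;
    # the neighbour values below are hypothetical, just to make the arithmetic concrete.
    gamma = 0.9
    p_intended, p_random = 0.5, 0.5 / 4
    neighbour_values = [1.0, 0.0, 0.0, -1.0]  # values reached by Right, Left, Down, Up

    best = max(
        gamma * (p_intended * v + p_random * sum(neighbour_values))
        for v in neighbour_values
    )
    print(round(best, 3))  # 0.45 with these hypothetical numbers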

    If you are wondering how to get the values for all states, here is the code snippet for it.

    import numpy as np
    import matplotlib.pyplot as plt
    from typing import List, Tuple
    
    class StochasticGridWorld:
        def __init__(self, size: int = 3, gamma: float = 0.9):
            self.size = size
            self.gamma = gamma
            # Initialize states
            self.values = np.zeros((size, size))
            self.values[0, 2] = -1  # Cat
            self.values[2, 2] = 1   # Cheese
            
            # Track value history for convergence visualization
            self.value_history = {(i, j): [] for i in range(size) for j in range(size)}
            
            # Movement probabilities: 0.5 for the intended move, with the remaining
            # 0.5 split evenly across all four directions (so the intended direction
            # effectively receives 0.5 + 0.125 = 0.625 in total)
            self.p_intended = 0.5
            self.p_random = 0.5 / 4
            
        def get_next_state(self, current_state: Tuple[int, int], 
                           action: Tuple[int, int]) -> Tuple[int, int]:
            """Calculate next state given current state and action"""
            next_i = current_state[0] + action[0]
            next_j = current_state[1] + action[1]
            
            # Check if next state is within grid
            if 0 <= next_i < self.size and 0 <= next_j < self.size:
                return (next_i, next_j)
            return current_state
        
        def get_possible_actions(self) -> List[Tuple[int, int]]:
            """Return all possible actions as (dx, dy)"""
            return [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Right, Left, Down, Up
        
        def calculate_state_value(self, state: Tuple[int, int]) -> float:
            """Calculate value for a given state considering all actions"""
            if state == (0, 2) or state == (2, 2):  # Terminal states
                return self.values[state]
            
            max_value = float('-inf')
            actions = self.get_possible_actions()
            
            for action in actions:
                value = 0.0  # Immediate reward is 0 for non-terminal states
                # Intended movement
                next_state = self.get_next_state(state, action)
                value += self.p_intended * self.values[next_state]
                
                # Random movements
                for random_action in actions:
                    random_next_state = self.get_next_state(state, random_action)
                    value += self.p_random * self.values[random_next_state]
                
                value = self.gamma * value  # Apply discount factor
                max_value = max(max_value, value)
                
            return max_value
        
        def value_iteration(self, num_iterations: int = 100, 
                           threshold: float = 1e-4) -> np.ndarray:
            """Perform value iteration and store history"""
            for iteration in range(num_iterations):
                delta = 0
                new_values = np.copy(self.values)
                
                for i in range(self.size):
                    for j in range(self.size):
                        if (i, j) not in [(0, 2), (2, 2)]:  # Skip terminal states
                            old_value = self.values[i, j]
                            new_values[i, j] = self.calculate_state_value((i, j))
                            delta = max(delta, abs(old_value - new_values[i, j]))
                            self.value_history[(i, j)].append(new_values[i, j])
                
                self.values = new_values
                
                # Check convergence
                if delta < threshold:
                    print(f"Converged after {iteration + 1} iterations")
                    break
            
            return self.values
        
        def plot_convergence(self):
            """Plot value convergence for each non-terminal state"""
            plt.figure(figsize=(12, 8))
            for state, history in self.value_history.items():
                if state not in [(0, 2), (2, 2)]:  # Skip terminal states
                    plt.plot(history, label=f'State {state}')
            
            plt.title('Value Convergence Over Iterations')
            plt.xlabel('Iteration')
            plt.ylabel('State Value')
            plt.legend()
            plt.grid(True)
            plt.show()
    
    # Run the simulation
    grid_world = StochasticGridWorld()
    final_values = grid_world.value_iteration(num_iterations=100)
    
    print("\nFinal Values:")
    print(np.round(final_values, 3))
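
    To visualize how each non-terminal state's value settles over the iterations, you can also call the plotting helper defined above:

    # Plot the per-state value history collected during value iteration
    grid_world.plot_convergence()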
    
  • How Does a Mouse Find Cheese? | Bellman Equation Made Simple

    In the video, we explain how the Bellman equation works in a deterministic world.
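
    As a quick illustration, here is a minimal sketch of a single deterministic Bellman backup, V(s) = R(s) + gamma * max over reachable next states s' of V(s'), using hypothetical neighbour values:

    # Minimal sketch of one deterministic Bellman backup:
    # V(s) = R(s) + gamma * max over reachable next states s' of V(s').
    # The neighbour values are hypothetical, just to make the arithmetic concrete.
    gamma = 0.9
    reward = 0.0                         # immediate reward of the current cell
    neighbour_values = [0.0, 0.81, 1.0]  # hypothetical values of reachable cells
    value = reward + gamma * max(neighbour_values)
    print(value)  # 0.9: the cell inherits 90% of its best neighbour's value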

    Here is a code snippet you can use and run to verify the values of the states in the 3×3 grid world.

    import numpy as np

    def value_iteration(rewards, gamma=0.9, tolerance=1e-4, max_iterations=1000):
        # Initialize value matrix
        V = np.zeros_like(rewards, dtype=float)
        # Set terminal state values
        V[0, 2] = -1  # Cat state
        V[2, 2] = 1   # Cheese state
        
        for iteration in range(max_iterations):
            delta = 0  # Track maximum change
            V_prev = V.copy()  # Store previous values
            
            for i in range(3):
                for j in range(3):
                    # Skip terminal states
                    if (i == 0 and j == 2) or (i == 2 and j == 2):
                        continue
                        
                    # Get values of possible next states
                    possible_values = []
                    
                    # Check all possible moves (up, down, left, right)
                    # Up
                    if i > 0:
                        possible_values.append(V_prev[i-1, j])
                    # Down
                    if i < 2:
                        possible_values.append(V_prev[i+1, j])
                    # Left
                    if j > 0:
                        possible_values.append(V_prev[i, j-1])
                    # Right
                    if j < 2:
                        possible_values.append(V_prev[i, j+1])
                    
                    # Update value using Bellman equation
                    best_next_value = max(possible_values)
                    V[i, j] = rewards[i, j] + gamma * best_next_value
                    
                    # Update delta
                    delta = max(delta, abs(V[i, j] - V_prev[i, j]))
            
            # Check for convergence
            if delta < tolerance:
                print(f"Converged after {iteration + 1} iterations")
                break
        
        return V
    
    # Initialize rewards matrix
    rewards = np.zeros((3, 3))
    rewards[0, 2] = -1  # Cat state
    rewards[2, 2] = 1   # Cheese state
    
    # Run value iteration
    V = value_iteration(rewards, gamma=0.9)
    
    # Round the values for better readability
    np.set_printoptions(precision=3, suppress=True)
    print("\nFinal Value Function:")
    print(V)
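
    If you also want to see which move the mouse would take in each cell, here is a small sketch (not part of the original snippet) that reads the greedy policy off the converged values, reusing the V computed above:

    # Sketch: derive the greedy policy (best move per cell) from the converged values.
    # U/D/L/R label the four moves checked in value_iteration; T marks terminal cells.
    moves = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}
    policy = np.full((3, 3), 'T', dtype=object)
    for i in range(3):
        for j in range(3):
            if (i, j) in [(0, 2), (2, 2)]:
                continue  # terminal states: cat and cheese
            best_move, best_value = None, float('-inf')
            for move, (di, dj) in moves.items():
                ni, nj = i + di, j + dj
                if 0 <= ni < 3 and 0 <= nj < 3 and V[ni, nj] > best_value:
                    best_move, best_value = move, V[ni, nj]
            policy[i, j] = best_move
    print("\nGreedy policy:")
    print(policy)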