Work through these 25 challenges before moving to Phase 1 (Math for RL). Each challenge is small and self-contained. If you get stuck, use the hint; only check the solution after a genuine attempt.
Try each challenge in the interactive REPL below it, or in a .py file on your machine.
Level 1 — Basics (Challenges 1–8)# Challenge 1 — Print with variables# Set name = "DQN" and version = 3. Print: "Algorithm: DQN version 3" using an f-string.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
# Challenge 1 — print variables with an f-string.
name, version = "DQN", 3
print(f"Algorithm: {name} version {version}")
Challenge 2 — Type check# gamma = "0.9" is a string, not a float. Convert it to a float, square it, and print the result (expected: 0.81).
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
# Challenge 2 — convert the string to a float before doing math.
gamma = "0.9"
gamma = float(gamma)  # "0.9" -> 0.9
# Round for display: 0.9 ** 2 is 0.81000000000000005 in binary floating
# point, but the challenge expects 0.81 to be printed.
print(round(gamma ** 2, 2))  # 0.81
Challenge 3 — Conditional reward# Set reward = -0.5. Print "Penalty" if the reward is negative, "Neutral" if zero, "Bonus" if positive.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
# Challenge 3 — classify a reward by its sign.
reward = -0.5
if reward > 0:
    print("Bonus")
elif reward < 0:
    print("Penalty")
else:
    print("Neutral")
Challenge 4 — Count steps# Use a while loop that starts with step = 0 and increments until step == 5. Print each step. Then print "Done".
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
# Challenge 4 — print steps 0..4 with a while loop, then report completion.
step = 0
while step != 5:
    print("Step", step)
    step += 1
print("Done")
Challenge 5 — Sum rewards# Use a for loop to compute the sum of rewards = [0.5, 0.3, -0.2, 1.0, 0.0]. Do not use sum(). Print the result (expected: 1.6).
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
# Challenge 5 — accumulate the rewards by hand (no sum(), per the exercise).
rewards = [0.5, 0.3, -0.2, 1.0, 0.0]
total = 0
for value in rewards:
    total = total + value
print(f"Total: {total}")  # 1.6
Challenge 6 — Function: clamp# Write clamp(x, lo, hi) that returns lo if x < lo, hi if x > hi, else x. (Used in PPO to clip ratios.) Test with clamp(1.5, 0.8, 1.2) → 1.2.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
def clamp(x, lo, hi):
    """Clip x into [lo, hi] — the operation PPO uses to clip ratios."""
    return lo if x < lo else (hi if x > hi else x)

print(clamp(1.5, 0.8, 1.2))  # 1.2
print(clamp(0.5, 0.8, 1.2))  # 0.8
print(clamp(1.0, 0.8, 1.2))  # 1.0
Challenge 7 — Build a trajectory list# Create an empty list trajectory. Append 5 tuples (step, step * 2, step * 0.1) representing (step, state, reward) for steps 0–4. Print the list.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
# Challenge 7 — build (step, state, reward) tuples for steps 0..4.
trajectory = [(step, step * 2, step * 0.1) for step in range(5)]
print(trajectory)
Challenge 8 — Dict lookup with default# Create Q = {("s0", "up"): 0.5, ("s0", "down"): -0.2}. Write a function get_q(Q, state, action) that returns the Q-value if it exists, else 0.0. Test with a missing key like ("s1", "up").
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
Q = {("s0", "up"): 0.5, ("s0", "down"): -0.2}

def get_q(Q, state, action):
    """Return the stored Q-value for (state, action), or 0.0 if unseen."""
    try:
        return Q[(state, action)]
    except KeyError:
        return 0.0

print(get_q(Q, "s0", "up"))  # 0.5
print(get_q(Q, "s1", "up"))  # 0.0
Level 2 — Combining skills (Challenges 9–17)# Challenge 9 — Coin flip fraction# Import random. Simulate flipping a fair coin 1000 times (random.choice([0, 1])). Print the fraction of heads (expected ≈ 0.5). Set random.seed(42) first.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
import random

# Challenge 9 — fraction of heads over 1000 fair coin flips.
random.seed(42)
n = 1000
heads = 0
for _ in range(n):
    heads += random.choice([0, 1])
print(f"Fraction of heads: {heads / n:.3f}")  # ≈ 0.5
Challenge 10 — Die roll counts# Roll a 6-sided die 600 times. Store counts in a dict {face: count}. Print each face and its count (expected ≈ 100 each).
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
import random

# Challenge 10 — tally 600 die rolls into {face: count}.
random.seed(0)
counts = {}
for _ in range(600):
    face = random.randint(1, 6)
    if face not in counts:
        counts[face] = 0
    counts[face] += 1
for face in sorted(counts):
    print(f"Face {face}: {counts[face]}")
Challenge 11 — Discounted return# Write discounted_return(rewards, gamma=0.9) without using NumPy. Test:
[0, 0, 1] → 0.81[1, 0, 0] → 1.0[1, 1, 1] → ≈2.71
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
def discounted_return(rewards, gamma=0.9):
    """Discounted return G = sum over t of gamma**t * rewards[t] (pure Python)."""
    total = 0.0
    discount = 1.0  # gamma**t, built up incrementally
    for r in rewards:
        total += discount * r
        discount *= gamma
    return total

print(discounted_return([0, 0, 1]))  # ≈ 0.81
print(discounted_return([1, 0, 0]))  # 1.0
print(f"{discounted_return([1, 1, 1]):.4f}")  # 2.7100
Challenge 12 — Find the max Q-action# Given Q_s = {"up": 0.3, "down": -0.1, "left": 0.7, "right": 0.2}, write best_action(Q_s) that returns the key with the highest value. Expected: "left".
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
Q_s = {"up": 0.3, "down": -0.1, "left": 0.7, "right": 0.2}

def best_action(Q_s):
    """Return the action with the highest Q-value (first one wins ties)."""
    best, best_value = None, float("-inf")
    for action, value in Q_s.items():
        if value > best_value:
            best, best_value = action, value
    return best

print(best_action(Q_s))  # left
Challenge 13 — Valid grid moves# Write valid_moves(row, col, n=4) that returns a list of valid actions (0=up, 1=down, 2=left, 3=right) for a cell in an n×n grid (actions that stay in bounds).
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
11
12
def valid_moves(row, col, n=4):
    """Actions (0=up, 1=down, 2=left, 3=right) that keep (row, col) on the n x n grid."""
    deltas = [(0, (-1, 0)), (1, (1, 0)), (2, (0, -1)), (3, (0, 1))]
    return [
        action
        for action, (dr, dc) in deltas
        if 0 <= row + dr < n and 0 <= col + dc < n
    ]

print(valid_moves(0, 0))  # [1, 3]
print(valid_moves(2, 2))  # [0, 1, 2, 3]
print(valid_moves(3, 3))  # [0, 2]
Challenge 14 — Episode simulator# Write simulate_episode(max_steps=10) that:
Starts at position 0 Each step: move +1 with 70% prob, -1 with 30% prob Stop if position ≥ 5 or ≤ -5, or after max_steps Return the list of positions visited
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
11
12
13
14
import random

random.seed(7)

def simulate_episode(max_steps=10):
    """Random walk from 0: +1 w.p. 0.7, -1 w.p. 0.3; stop at |pos| >= 5 or max_steps.

    Returns the list of positions visited (including the start).
    """
    positions = [0]
    while len(positions) <= max_steps:
        move = 1 if random.random() < 0.7 else -1
        positions.append(positions[-1] + move)
        if abs(positions[-1]) >= 5:
            break
    return positions

print(simulate_episode())
Challenge 15 — Running mean# Write running_mean(values) that returns a list where the i-th element is the average of values[0:i+1] (used for smoothing reward curves).
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
def running_mean(values):
    """Prefix averages: element i is the mean of values[0:i+1] (reward-curve smoothing)."""
    averages = []
    running_total = 0
    count = 0
    for v in values:
        count += 1
        running_total += v
        averages.append(running_total / count)
    return averages

print(running_mean([1, 3, 2, 4]))  # [1.0, 2.0, 2.0, 2.5]
Challenge 16 — Epsilon-greedy# Write epsilon_greedy(q_values, epsilon=0.1) that returns a random action index with probability epsilon, else the greedy action index.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
import random

random.seed(1)

def epsilon_greedy(q_values, epsilon=0.1):
    """Pick a random action index with probability epsilon, else the greedy one."""
    explore = random.random() < epsilon
    if explore:
        return random.randrange(len(q_values))
    return q_values.index(max(q_values))

q = [0.2, 0.7, 0.1]
print([epsilon_greedy(q, 0.0) for _ in range(5)])  # [1, 1, 1, 1, 1]
Challenge 17 — Count action visits# Simulate 200 steps of epsilon-greedy with q = [0.2, 0.7, 0.1, 0.4] and epsilon=0.2. Count how many times each action was chosen. Print the counts as a dict.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
import random

# Challenge 17 — how often epsilon-greedy picks each action over 200 steps.
# NOTE: relies on epsilon_greedy() from Challenge 16 being defined already.
random.seed(42)
q = [0.2, 0.7, 0.1, 0.4]
counts = dict.fromkeys(range(len(q)), 0)
for _ in range(200):
    counts[epsilon_greedy(q, 0.2)] += 1
print(counts)
Level 3 — Small RL programs (Challenges 18–25)# Challenge 18 — Incremental mean# Write incremental_mean_update(Q_old, n, reward) that returns the new mean after seeing reward as the n-th observation: Q_new = Q_old + (reward - Q_old) / n. Test: start with Q=0, update with rewards [1.2, 0.8, 1.0, 1.4]. Final mean should be 1.1.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
def incremental_mean_update(Q_old, n, reward):
    """Running mean after the n-th observation: Q + (r - Q) / n."""
    return Q_old + (reward - Q_old) / n

Q = 0.0
n = 0
for r in [1.2, 0.8, 1.0, 1.4]:
    n += 1
    Q = incremental_mean_update(Q, n, r)
print(f"Final mean: {Q:.2f}")  # 1.10
Challenge 19 — 3-armed bandit simulation# Simulate a 3-armed bandit for 300 steps with true means [0.3, 0.7, 0.1]. Use epsilon-greedy (ε=0.1) and incremental mean updates. Print the final Q estimates and which arm was pulled most.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import random

# Challenge 19 — 3-armed bandit: epsilon-greedy + incremental mean updates.
random.seed(42)
true_means = [0.3, 0.7, 0.1]
k = len(true_means)
Q = [0.0] * k  # estimated value per arm
N = [0] * k    # pull count per arm
epsilon = 0.1
for step in range(300):
    if random.random() < epsilon:
        action = random.randrange(k)   # explore
    else:
        action = Q.index(max(Q))       # exploit (first max on ties)
    reward = random.gauss(true_means[action], 1)
    N[action] += 1
    Q[action] += (reward - Q[action]) / N[action]  # incremental mean
print("Q estimates:", [round(q, 3) for q in Q])
print("Pull counts:", N)
# Bug fix: "most pulled" must come from the visit counts N, not from the
# Q estimates — Q.index(max(Q)) reports the highest-valued arm instead.
print("Most pulled:", N.index(max(N)))
Challenge 20 — Random walk value estimation# Simulate a 1D random walk: states 0–6, start at 3, walk left/right equally. Episode ends at 0 or 6 (reward +1 at 6, 0 at 0). Run 2000 episodes. Estimate V(s) = fraction of episodes from s that reached 6.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import random

random.seed(0)

def random_walk_episode(start=3):
    """One 1D walk over states 0..6; returns [(state, reward), ...] after each move.

    Reward is 1 only on reaching state 6; episodes end at 0 or 6.
    """
    traj = []
    s = start
    while True:
        if s in (0, 6):
            break
        s += 1 if random.random() < 0.5 else -1
        traj.append((s, 1 if s == 6 else 0))
    return traj

# Every-visit Monte Carlo estimate of V(s) = P(reach 6 from s).
returns = {s: [] for s in range(1, 6)}
for _ in range(2000):
    ep = random_walk_episode()
    for i, (s, r) in enumerate(ep):
        if s in returns:
            G = 0
            for _, rr in ep[i:]:
                G += rr
            returns[s].append(G)
for s in range(1, 6):
    if returns[s]:
        v = sum(returns[s]) / len(returns[s])
    else:
        v = 0
    print(f"V({s}) = {v:.3f} (true = {s/6:.3f})")
Challenge 21 — Find the bug (1)# This function is supposed to compute discounted return but has a bug. Find and fix it.
1
2
3
4
5
def broken_return(rewards, gamma=0.9):
    # Deliberately buggy example for the find-the-bug exercise:
    # gamma is never applied, so this is the undiscounted sum.
    total = 0
    for t, r in enumerate(rewards):
        total += r
    return total
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
def fixed_return(rewards, gamma=0.9):
    """Discounted return G = sum over t of gamma**t * rewards[t] — discount restored."""
    return sum(gamma ** t * r for t, r in enumerate(rewards))

print(fixed_return([0, 0, 1]))  # ≈ 0.81
print(fixed_return([1, 1, 1]))  # ≈ 2.71
Challenge 22 — Find the bug (2)# This epsilon-greedy function explores when it should exploit and vice versa. Fix it.
1
2
3
4
5
def bad_epsilon_greedy(Q, epsilon=0.1):
    import random
    # Deliberately inverted for the find-the-bug exercise: using `>` means
    # the agent explores with probability 1 - epsilon and exploits with
    # probability epsilon — the opposite of epsilon-greedy.
    greedy = Q.index(max(Q))
    if random.random() > epsilon:
        return random.randrange(len(Q))
    return greedy
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
import random

def fixed_eg(Q, epsilon=0.1):
    """Explore with probability epsilon, otherwise act greedily."""
    if random.random() >= epsilon:
        return Q.index(max(Q))
    return random.randrange(len(Q))

random.seed(5)
Q = [0.1, 0.9, 0.3]
print([fixed_eg(Q, 0.0) for _ in range(5)])  # [1, 1, 1, 1, 1]
Challenge 23 — Q-table update# Implement one step of Q-learning: Q[s][a] += alpha * (r + gamma * max(Q[s_next]) - Q[s][a]).
Given:
Q = {"A": [0.0, 0.5], "B": [0.3, 0.2]}Transition: s="A", a=0, r=1, s_next="B", done=False alpha=0.1, gamma=0.9Print the updated Q["A"][0].
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
# Challenge 23 — one Q-learning update on a tiny Q-table.
Q = {"A": [0.0, 0.5], "B": [0.3, 0.2]}
alpha, gamma = 0.1, 0.9
s, a, r, s_next, done = "A", 0, 1, "B", False

# TD target: bootstrap from the best next-state value unless the episode ended.
if done:
    target = r
else:
    target = r + gamma * max(Q[s_next])
td_error = target - Q[s][a]
Q[s][a] = Q[s][a] + alpha * td_error
print(f"Q[A][0] = {Q['A'][0]:.4f}")  # 0.1 * (1 + 0.9*0.3 - 0) = 0.127
Challenge 24 — Multi-episode return tracking# Run 50 episodes of a random agent on a 3×3 gridworld (use your step() function from Phase 0). Each episode: start at (0,0), take random actions, stop at (2,2) or after 20 steps. Collect the total (undiscounted) reward per episode. Print the mean and max episode return.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import random

random.seed(0)

def step(state, action):
    """One transition on a 3x3 gridworld.

    Returns (next_state, reward, done): bumping a wall keeps the state and
    costs -1; reaching (2, 2) gives +1 and ends the episode; otherwise 0.
    """
    deltas = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
    dr, dc = deltas[action]
    nxt = (state[0] + dr, state[1] + dc)
    if not (0 <= nxt[0] <= 2 and 0 <= nxt[1] <= 2):
        return state, -1, False
    if nxt == (2, 2):
        return nxt, 1, True
    return nxt, 0, False

# 50 episodes of a random agent; track the undiscounted return per episode.
episode_returns = []
for _ in range(50):
    state = (0, 0)
    G = 0
    done = False
    for _ in range(20):
        if done:
            break
        a = random.randint(0, 3)
        state, r, done = step(state, a)
        G += r
    episode_returns.append(G)

print(f"Mean return: {sum(episode_returns) / len(episode_returns):.2f}")
print(f"Max return: {max(episode_returns)}")
Challenge 25 — Full mini-agent# Implement a complete bandit agent: 5 arms, true means drawn from Normal(0, 1), run 500 steps. Use epsilon-greedy (ε=0.1) and incremental updates. Report: (1) estimated Q for each arm, (2) which arm the agent converged to, (3) whether it is the true best arm.
Try it — edit and run (Shift+Enter)
Load Python REPL Runs in your browser — no install needed
Solution 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import random

# Challenge 25 — full mini-agent: 5-armed Gaussian bandit, epsilon-greedy,
# incremental mean estimates, 500 steps.
random.seed(99)
k = 5
true_means = [random.gauss(0, 1) for _ in range(k)]
Q = [0.0] * k  # estimated value per arm
N = [0] * k    # pull count per arm
epsilon = 0.1
for _ in range(500):
    explore = random.random() < epsilon
    a = random.randrange(k) if explore else Q.index(max(Q))
    r = random.gauss(true_means[a], 1)
    N[a] += 1
    Q[a] += (r - Q[a]) / N[a]
print("True means: ", [round(m, 3) for m in true_means])
print("Q estimates: ", [round(q, 3) for q in Q])
print("Pull counts: ", N)
print("Best (true): ", true_means.index(max(true_means)))
print("Best (learned):", Q.index(max(Q)))
Checklist# Before moving to Phase 1, confirm:
When all four are checked, proceed to Phase 1 — Math for RL.