From e8ac466567ad724e3f21e80b0b5ed078487498a4 Mon Sep 17 00:00:00 2001
From: Donato Meoli <donato.meoli.95@gmail.com>
Date: Sat, 27 Jun 2026 02:12:56 +0200
Subject: [PATCH] Fix terminal Q-value in QLearningAgent/SARSA (use r, not r1)

When the previous state s was terminal, the agent set Q[s, None] = r1 (the
reward of the *new* percept) instead of r (the stored reward received at the
terminal state). AIMA Fig 21.8 specifies Q[s, None] <- r. The bug made
terminal Q-values wrong (e.g. ~-0.04 or noisy values instead of +1/-1) and
yielded incorrect policies. Verified: terminal Q-values now converge to
~+1 / ~-1. Fixes #1247.
---
 reinforcement_learning.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reinforcement_learning.py b/reinforcement_learning.py
index a6f978fc6..9c714adb8 100644
--- a/reinforcement_learning.py
+++ b/reinforcement_learning.py
@@ -290,7 +290,7 @@ def __call__(self, percept):
         actions_in_state = self.actions_in_state
 
         if s in terminals:
-            Q[s, None] = r1
+            Q[s, None] = r
         if s is not None:
             Nsa[s, a] += 1
             Q[s, a] += alpha(Nsa[s, a]) * (r + gamma * max(Q[s1, a1]
@@ -330,7 +330,7 @@ def __call__(self, percept):
         a1 = max(actions_in_state(s1), key=lambda a2: self.f(Q[s1, a2], Nsa[s1, a2]))
 
         if s in terminals:
-            Q[s, None] = r1
+            Q[s, None] = r
         if s is not None:
             Nsa[s, a] += 1
             # on-policy update: bootstrap on the actually-chosen next action a1