From e8ac466567ad724e3f21e80b0b5ed078487498a4 Mon Sep 17 00:00:00 2001 From: Donato Meoli Date: Sat, 27 Jun 2026 02:12:56 +0200 Subject: [PATCH] Fix terminal Q-value in QLearningAgent/SARSA (use r, not r1) When the previous state s was terminal, the agent set Q[s, None] = r1 (the reward of the *new* percept) instead of r (the stored reward received at the terminal state), contrary to AIMA Fig 21.8 (Q[s, None] <- r). This made terminal Q-values wrong (e.g. ~-0.04 / noisy instead of +1/-1) and yielded incorrect policies. Verified: terminal Q-values now converge to ~+1 / ~-1. Fixes #1247. --- reinforcement_learning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reinforcement_learning.py b/reinforcement_learning.py index a6f978fc6..9c714adb8 100644 --- a/reinforcement_learning.py +++ b/reinforcement_learning.py @@ -290,7 +290,7 @@ def __call__(self, percept): actions_in_state = self.actions_in_state if s in terminals: - Q[s, None] = r1 + Q[s, None] = r if s is not None: Nsa[s, a] += 1 Q[s, a] += alpha(Nsa[s, a]) * (r + gamma * max(Q[s1, a1] @@ -330,7 +330,7 @@ def __call__(self, percept): a1 = max(actions_in_state(s1), key=lambda a2: self.f(Q[s1, a2], Nsa[s1, a2])) if s in terminals: - Q[s, None] = r1 + Q[s, None] = r if s is not None: Nsa[s, a] += 1 # on-policy update: bootstrap on the actually-chosen next action a1