# -*- coding: utf-8 -*-
"""Value iteration on a small grid world, with a value/policy plot.

1. Problem description
   An agent moves up, down, left or right on a 5x5 grid.  Entering the goal
   cell (4, 4) yields a reward of 1, entering the trap cell (2, 2) yields -1,
   and entering any other cell costs a small step penalty (-0.01).  The script
   plots the state-value matrix together with the greedy policy, i.e. the
   best move for every cell.

2. Implementation
   See ``BellmanVExplorer`` (solver) and ``visualize`` (plot) below.

3. Sample run
   On a CUDA device the iteration converges at step 9 and prints this
   state-value matrix V(s):

       [[0.42612657 0.48458508 0.54953897 0.62170994 0.70189995]
        [0.48458508 0.54953897 0.62170994 0.70189995 0.79099995]
        [0.54953897 0.62170994 0.70189995 0.79099995 0.89      ]
        [0.62170994 0.70189995 0.79099995 0.89       1.        ]
        [0.70189995 0.79099995 0.89       1.         0.        ]]
"""
import numpy as np
import torch

# All tensors of the solver live on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"系统检测正在使用 [{device}] 运行计算...")


class BellmanVExplorer:
    """Solve a square grid world with synchronous value iteration.

    The reward is attached to the cell being *entered*: ``R[goal] = 1``,
    ``R[trap] = -1`` and ``-0.01`` everywhere else.  A move that would leave
    the grid keeps the agent in place.  Only the goal is terminal (its value
    is never updated); the trap is an ordinary, heavily penalised cell.

    Attributes:
        size: Side length of the grid.
        gamma: Discount factor.
        actions: ``(d_row, d_col)`` offsets in the order up, down, left, right.
        action_names: Arrow glyph for each entry of ``actions``.
        goal: ``(row, col)`` of the terminal cell (bottom-right corner).
        trap: ``(row, col)`` of the penalty cell (grid centre).
        V: ``(size, size)`` float32 tensor of state values.
        R: ``(size, size)`` tensor of rewards for entering each cell.
    """

    def __init__(self, size: int = 5, gamma: float = 0.9):
        """Build the reward grid and zero-initialise the value table.

        Args:
            size: Side length of the grid; must be at least 3.
            gamma: Discount factor applied to the successor's value.

        Raises:
            ValueError: If ``size`` is so small that goal and trap coincide.
        """
        self.size = size
        self.gamma = gamma
        self.actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right
        self.action_names = ["↑", "↓", "←", "→"]
        self.goal = (size - 1, size - 1)
        self.trap = (size // 2, size // 2)
        if self.goal == self.trap:
            # For size < 3 both land on the same cell and the trap reward
            # would silently overwrite the goal reward.
            raise ValueError(
                "size must be at least 3 so that the goal and the trap "
                "are distinct cells"
            )

        self.V = torch.zeros((size, size), dtype=torch.float32, device=device)
        self.R = torch.full((size, size), -0.01, device=device)  # small per-step penalty
        self.R[self.goal] = 1.0  # goal reward
        self.R[self.trap] = -1.0  # trap penalty

        # Successor coordinates for every (action, row, col), computed once so
        # each sweep is a single vectorised lookup instead of a Python loop.
        successors = [
            [
                [self.get_next_state(r, c, action) for c in range(size)]
                for r in range(size)
            ]
            for action in self.actions
        ]
        self._next_r = torch.tensor(
            [[[cell[0] for cell in row] for row in grid] for grid in successors],
            dtype=torch.long,
            device=device,
        )
        self._next_c = torch.tensor(
            [[[cell[1] for cell in row] for row in grid] for grid in successors],
            dtype=torch.long,
            device=device,
        )

    def get_next_state(self, r: int, c: int, action) -> tuple:
        """Return the cell reached from ``(r, c)`` by ``action``.

        Args:
            r: Current row.
            c: Current column.
            action: ``(d_row, d_col)`` offset.

        Returns:
            The target ``(row, col)``, or ``(r, c)`` unchanged when the move
            would leave the grid (the agent bumps into the wall).
        """
        nr, nc = r + action[0], c + action[1]
        if 0 <= nr < self.size and 0 <= nc < self.size:
            return nr, nc
        return r, c

    def _action_values(self, values):
        """Return ``Q[a, r, c] = R(s') + gamma * values(s')`` for all moves."""
        return (
            self.R[self._next_r, self._next_c]
            + self.gamma * values[self._next_r, self._next_c]
        )

    def run_iteration(self, max_steps: int = 100, tol: float = 1e-6) -> np.ndarray:
        """Run value iteration until the largest update drops below ``tol``.

        Every sweep applies the Bellman optimality backup
        ``V(s) = max_a [R(s') + gamma * V(s')]`` to all cells at once, using
        the values of the previous sweep.  The goal cell is skipped.

        Args:
            max_steps: Maximum number of sweeps.
            tol: Convergence threshold on the largest absolute value change.

        Returns:
            The ``(size, size)`` value table as a NumPy array on the CPU.
        """
        print(f"开始价值迭代 (设备: {self.V.device})...")
        for i in range(max_steps):
            v_new = self._action_values(self.V).max(dim=0).values
            v_new[self.goal] = self.V[self.goal]  # terminal state: never updated
            diff = torch.max(torch.abs(v_new - self.V)).item()
            self.V.copy_(v_new)  # keep updating the same tensor in place
            if diff < tol:
                print(f"算法在第 {i + 1} 步收敛。")
                break
        return self.V.cpu().numpy()

    def get_policy(self) -> np.ndarray:
        """Return the greedy policy for the current value table.

        Moves are ranked by ``R(s') + gamma * V(s')``, the same quantity the
        value backup maximises.  Ranking by ``V(s')`` alone is wrong here: the
        terminal goal has value 0 and the trap's own value ignores the penalty
        for entering it, so the agent would avoid the goal and walk into the
        trap.  Ties go to the first action in ``self.actions``.

        Returns:
            A ``(size, size)`` object array holding ``"G"`` for the goal, the
            best arrow for ordinary cells, and ``"T"`` followed by the best
            arrow for the trap (it is not terminal, so it still has a move).
        """
        # One device-to-host transfer; numpy's argmax returns the first maximum.
        q_values = self._action_values(self.V).cpu().numpy()
        best_action = q_values.argmax(axis=0)

        policy = np.full((self.size, self.size), "", dtype=object)
        for r in range(self.size):
            for c in range(self.size):
                arrow = self.action_names[best_action[r, c]]
                if (r, c) == self.goal:
                    policy[r, c] = "G"
                elif (r, c) == self.trap:
                    policy[r, c] = "T" + arrow
                else:
                    policy[r, c] = arrow
        return policy


def visualize(v_matrix, policy):
    """Plot the value matrix as a heat map annotated with value and policy.

    Args:
        v_matrix: 2-D array of state values.
        policy: Array of the same shape holding the label to show per cell.
    """
    # Imported lazily so the solver can be used without a plotting backend.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.matshow(v_matrix, cmap="coolwarm")
    for i in range(v_matrix.shape[0]):
        for j in range(v_matrix.shape[1]):
            val = v_matrix[i, j]
            ax.text(
                j,
                i,
                f"{val:.2f}\n{policy[i, j]}",
                va="center",
                ha="center",
                color="black",
                fontweight="bold",
            )
    plt.title("State Value V(s) and Optimal Policy")
    plt.show()


def main():
    """Solve the default 5x5 grid, print the values and show the plot."""
    solver = BellmanVExplorer(size=5, gamma=0.9)
    final_v = solver.run_iteration()
    optimal_policy = solver.get_policy()
    print("\n最终状态价值矩阵 V(s):")
    print(final_v)
    visualize(final_v, optimal_policy)


if __name__ == "__main__":
    main()