Go语言深度学习:强化学习与智能决策
# Go语言深度学习:强化学习与智能决策

强化学习(RL)是机器学习的重要分支,关注智能体如何在环境中采取行动以最大化累积奖励。本文将深入探讨如何使用Go语言实现强化学习算法,并构建智能决策系统。

## 一、强化学习概述

强化学习的核心要素包括:

- 智能体(Agent):执行动作的实体
- 环境(Environment):智能体所处的世界
- 状态(State):环境的当前情况
- 动作(Action):智能体可以执行的操作
- 奖励(Reward):环境对动作的反馈
- 策略(Policy):智能体选择动作的规则

## 二、Q-Learning实现

### 2.1 算法原理

Q-Learning是一种值迭代算法,学习动作价值函数Q(s, a),表示在状态s下采取动作a的预期回报。

### 2.2 Go语言实现

```go
package main

import (
	"fmt"
	"math"
	"math/rand"
)

type QLearner struct {
	Q               map[string]map[int]float64
	learningRate    float64
	discountFactor  float64
	explorationRate float64
	actions         []int
}

func NewQLearner(actions []int, lr, df, er float64) *QLearner {
	return &QLearner{
		Q:               make(map[string]map[int]float64),
		learningRate:    lr,
		discountFactor:  df,
		explorationRate: er,
		actions:         actions,
	}
}

func (ql *QLearner) getQ(state string, action int) float64 {
	if _, ok := ql.Q[state]; !ok {
		ql.Q[state] = make(map[int]float64)
	}
	if val, ok := ql.Q[state][action]; ok {
		return val
	}
	return 0
}

func (ql *QLearner) setQ(state string, action int, value float64) {
	if _, ok := ql.Q[state]; !ok {
		ql.Q[state] = make(map[int]float64)
	}
	ql.Q[state][action] = value
}

func (ql *QLearner) chooseAction(state string) int {
	// epsilon-greedy策略
	if rand.Float64() < ql.explorationRate {
		return ql.actions[rand.Intn(len(ql.actions))]
	}

	// 选择Q值最大的动作
	maxQ := math.Inf(-1)
	bestAction := ql.actions[0]
	for _, action := range ql.actions {
		q := ql.getQ(state, action)
		if q > maxQ {
			maxQ = q
			bestAction = action
		}
	}
	return bestAction
}

func (ql *QLearner) learn(state, nextState string, action int, reward float64) {
	currentQ := ql.getQ(state, action)

	// 找到下一状态的最大Q值
	maxNextQ := math.Inf(-1)
	for _, a := range ql.actions {
		q := ql.getQ(nextState, a)
		if q > maxNextQ {
			maxNextQ = q
		}
	}

	// Q-learning更新公式
	newQ := currentQ + ql.learningRate*(reward+ql.discountFactor*maxNextQ-currentQ)
	ql.setQ(state, action, newQ)
}
```

### 2.3 使用示例

```go
func main() {
	actions := []int{0, 1, 2} // 三个动作:左、中、右
	ql := NewQLearner(actions, 0.1, 0.9, 0.1)

	// 模拟训练过程
	for episode := 0; episode < 1000; episode++ {
		state := "start"
		for {
			action := ql.chooseAction(state)
			// 模拟环境反馈
			nextState, reward, done := simulateEnvironment(state, action)
			ql.learn(state, nextState, action, reward)
			state = nextState
			if done {
				break
			}
		}
	}

	fmt.Println("训练完成")
	fmt.Println("Q表:", ql.Q)
}

func simulateEnvironment(state string, action int) (string, float64, bool) {
	// 简化的环境模拟
	if state == "goal" {
		return "goal", 0, true
	}
	if action == 1 { // 中间动作到达目标
		return "goal", 10, true
	}
	return "start", -1, false
}
```

## 三、SARSA算法实现

```go
type SARSALearner struct {
	Q               map[string]map[int]float64
	learningRate    float64
	discountFactor  float64
	explorationRate float64
	actions         []int
}

func NewSARSALearner(actions []int, lr, df, er float64) *SARSALearner {
	return &SARSALearner{
		Q:               make(map[string]map[int]float64),
		learningRate:    lr,
		discountFactor:  df,
		explorationRate: er,
		actions:         actions,
	}
}

func (sarsa *SARSALearner) getQ(state string, action int) float64 {
	if _, ok := sarsa.Q[state]; !ok {
		sarsa.Q[state] = make(map[int]float64)
	}
	if val, ok := sarsa.Q[state][action]; ok {
		return val
	}
	return 0
}

func (sarsa *SARSALearner) setQ(state string, action int, value float64) {
	if _, ok := sarsa.Q[state]; !ok {
		sarsa.Q[state] = make(map[int]float64)
	}
	sarsa.Q[state][action] = value
}

func (sarsa *SARSALearner) chooseAction(state string) int {
	if rand.Float64() < sarsa.explorationRate {
		return sarsa.actions[rand.Intn(len(sarsa.actions))]
	}

	maxQ := math.Inf(-1)
	bestAction := sarsa.actions[0]
	for _, action := range sarsa.actions {
		q := sarsa.getQ(state, action)
		if q > maxQ {
			maxQ = q
			bestAction = action
		}
	}
	return bestAction
}

func (sarsa *SARSALearner) learn(state, nextState string, action, nextAction int, reward float64) {
	currentQ := sarsa.getQ(state, action)
	nextQ := sarsa.getQ(nextState, nextAction)

	// SARSA更新公式
	newQ := currentQ + sarsa.learningRate*(reward+sarsa.discountFactor*nextQ-currentQ)
	sarsa.setQ(state, action, newQ)
}
```

## 四、Deep Q-Network (DQN)实现

注:本节及后续示例依赖 `NeuralNetwork` 类型及其 `NewNeuralNetwork`、`forward`、`train` 等方法,本文未给出其实现。

```go
type DQN struct {
	model           *NeuralNetwork
	targetModel     *NeuralNetwork
	replayBuffer    []Experience
	batchSize       int
	learningRate    float64
	discountFactor  float64
	explorationRate float64
}

type Experience struct {
	state     []float64
	action    int
	reward    float64
	nextState []float64
	done      bool
}

func NewDQN(stateSize, actionSize int) *DQN {
	dqn := &DQN{
		model:           NewNeuralNetwork(stateSize, 64, actionSize),
		targetModel:     NewNeuralNetwork(stateSize, 64, actionSize),
		replayBuffer:    make([]Experience, 0),
		batchSize:       32,
		learningRate:    0.001,
		discountFactor:  0.99,
		explorationRate: 1.0,
	}
	dqn.updateTargetModel()
	return dqn
}

func (dqn *DQN) updateTargetModel() {
	dqn.targetModel.weights = make([][]float64, len(dqn.model.weights))
	for i := range dqn.model.weights {
		dqn.targetModel.weights[i] = make([]float64, len(dqn.model.weights[i]))
		copy(dqn.targetModel.weights[i], dqn.model.weights[i])
	}
}

func (dqn *DQN) chooseAction(state []float64) int {
	if rand.Float64() < dqn.explorationRate {
		return rand.Intn(len(dqn.model.weights))
	}

	qValues := dqn.model.forward(state)
	maxIdx := 0
	maxVal := qValues[0]
	for i, val := range qValues {
		if val > maxVal {
			maxVal = val
			maxIdx = i
		}
	}
	return maxIdx
}

func (dqn *DQN) addExperience(exp Experience) {
	dqn.replayBuffer = append(dqn.replayBuffer, exp)
	if len(dqn.replayBuffer) > 10000 {
		dqn.replayBuffer = dqn.replayBuffer[1:]
	}
}

func (dqn *DQN) train() {
	if len(dqn.replayBuffer) < dqn.batchSize {
		return
	}

	// 随机采样
	batch := make([]Experience, dqn.batchSize)
	for i := 0; i < dqn.batchSize; i++ {
		idx := rand.Intn(len(dqn.replayBuffer))
		batch[i] = dqn.replayBuffer[idx]
	}

	// 计算目标Q值
	for _, exp := range batch {
		target := exp.reward
		if !exp.done {
			nextQ := dqn.targetModel.forward(exp.nextState)
			maxNextQ := math.Inf(-1)
			for _, q := range nextQ {
				if q > maxNextQ {
					maxNextQ = q
				}
			}
			target += dqn.discountFactor * maxNextQ
		}

		// 更新模型
		dqn.model.train(exp.state, target, exp.action, dqn.learningRate)
	}

	// 衰减探索率
	dqn.explorationRate *= 0.995
	if dqn.explorationRate < 0.01 {
		dqn.explorationRate = 0.01
	}
}
```

## 五、策略梯度实现

```go
type PolicyGradient struct {
	model          *NeuralNetwork
	learningRate   float64
	discountFactor float64
	rewards        []float64
	logProbs       []float64
}

func NewPolicyGradient(stateSize, actionSize int) *PolicyGradient {
	return &PolicyGradient{
		model:          NewNeuralNetwork(stateSize, 64, actionSize),
		learningRate:   0.001,
		discountFactor: 0.99,
		rewards:        make([]float64, 0),
		logProbs:       make([]float64, 0),
	}
}

func (pg *PolicyGradient) chooseAction(state []float64) int {
	logits := pg.model.forward(state)
	probs := softmax(logits)

	// 根据概率选择动作
	randVal := rand.Float64()
	cumulative := 0.0
	for i, prob := range probs {
		cumulative += prob
		if randVal < cumulative {
			// 记录对数概率
			pg.logProbs = append(pg.logProbs, math.Log(prob))
			return i
		}
	}
	return len(probs) - 1
}

func (pg *PolicyGradient) addReward(reward float64) {
	pg.rewards = append(pg.rewards, reward)
}

func (pg *PolicyGradient) train() {
	// 计算折扣回报
	discountedRewards := make([]float64, len(pg.rewards))
	runningSum := 0.0
	for i := len(pg.rewards) - 1; i >= 0; i-- {
		runningSum = pg.rewards[i] + pg.discountFactor*runningSum
		discountedRewards[i] = runningSum
	}

	// 标准化回报
	mean := 0.0
	for _, r := range discountedRewards {
		mean += r
	}
	mean /= float64(len(discountedRewards))

	variance := 0.0
	for _, r := range discountedRewards {
		variance += math.Pow(r-mean, 2)
	}
	variance = math.Sqrt(variance / float64(len(discountedRewards)))

	for i := range discountedRewards {
		discountedRewards[i] = (discountedRewards[i] - mean) / (variance + 1e-8)
	}

	// 更新策略
	for i := 0; i < len(pg.logProbs); i++ {
		pg.model.updatePolicy(pg.logProbs[i], discountedRewards[i], pg.learningRate)
	}

	// 重置
	pg.rewards = make([]float64, 0)
	pg.logProbs = make([]float64, 0)
}

func softmax(x []float64) []float64 {
	maxVal := math.Inf(-1)
	for _, v := range x {
		if v > maxVal {
			maxVal = v
		}
	}

	exp := make([]float64, len(x))
	var sum float64
	for i, v := range x {
		exp[i] = math.Exp(v - maxVal)
		sum += exp[i]
	}

	output := make([]float64, len(x))
	for i := range output {
		output[i] = exp[i] / sum
	}
	return output
}
```

## 六、Actor-Critic实现

```go
type ActorCritic struct {
	actor          *NeuralNetwork
	critic         *NeuralNetwork
	learningRate   float64
	discountFactor float64
}

func NewActorCritic(stateSize, actionSize int) *ActorCritic {
	return &ActorCritic{
		actor:          NewNeuralNetwork(stateSize, 64, actionSize),
		critic:         NewNeuralNetwork(stateSize, 64, 1),
		learningRate:   0.001,
		discountFactor: 0.99,
	}
}

func (ac *ActorCritic) chooseAction(state []float64) int {
	logits := ac.actor.forward(state)
	probs := softmax(logits)

	randVal := rand.Float64()
	cumulative := 0.0
	for i, prob := range probs {
		cumulative += prob
		if randVal < cumulative {
			return i
		}
	}
	return len(probs) - 1
}

func (ac *ActorCritic) train(state, nextState []float64, action int, reward float64, done bool) {
	// 计算价值估计
	value := ac.critic.forward(state)[0]
	nextValue := 0.0
	if !done {
		nextValue = ac.critic.forward(nextState)[0]
	}

	// 计算优势
	advantage := reward + ac.discountFactor*nextValue - value

	// 更新Actor
	ac.actor.updatePolicyWithAdvantage(state, action, advantage, ac.learningRate)

	// 更新Critic
	target := reward + ac.discountFactor*nextValue
	ac.critic.updateValue(state, target, ac.learningRate)
}
```

## 七、实战:CartPole环境

```go
func main() {
	dqn := NewDQN(4, 2) // 4维状态,2个动作

	for episode := 0; episode < 1000; episode++ {
		state := resetCartPole()
		totalReward := 0.0

		for {
			action := dqn.chooseAction(state)
			nextState, reward, done := stepCartPole(action)

			dqn.addExperience(Experience{
				state:     state,
				action:    action,
				reward:    reward,
				nextState: nextState,
				done:      done,
			})

			totalReward += reward
			state = nextState

			if done {
				break
			}
		}

		dqn.train()

		if episode%100 == 0 {
			fmt.Printf("Episode %d: Reward %.2f\n", episode, totalReward)
		}
	}
}
```

## 八、总结

本文介绍了强化学习的核心算法及其Go语言实现:

- Q-Learning:基于值迭代的经典算法
- SARSA:在线策略学习算法
- DQN:深度Q网络,结合深度学习与强化学习
- 策略梯度:直接优化策略函数
- Actor-Critic:结合策略梯度和值函数

Go语言的高性能特性使其成为构建强化学习系统的理想选择,特别是在需要高并发和低延迟的场景中。