# Implementing a random search policy with PyTorch

## 2. Implementing the random search algorithm with PyTorch

import gym
import torch
from matplotlib import pyplot as plt
# Create the CartPole environment (classic control task; an episode is
# capped at 200 steps in CartPole-v0, so 200 is the maximum total reward).
env = gym.make('CartPole-v0')

# Size of the observation vector (CartPole observations have 4 components).
n_state = env.observation_space.shape[0]
print(n_state)
# 4
# Number of discrete actions (CartPole: 2 — push left or push right).
n_action = env.action_space.n
print(n_action)
# 2

def run_episode(env, weight):
    """Run one full episode with a fixed linear policy and return its total reward.

    The policy is deterministic: the state vector is multiplied by ``weight``
    (shape ``(n_state, n_action)``) and the action with the largest score is
    taken. Uses the old gym API where ``env.step`` returns a 4-tuple
    ``(state, reward, done, info)``.

    Args:
        env: a gym-style environment whose ``reset`` returns a numpy state
            and whose ``step`` returns ``(state, reward, done, info)``.
        weight: torch tensor mapping states to per-action scores.

    Returns:
        The accumulated reward for the episode (float).
    """
    state = env.reset()
    total_reward = 0
    is_done = False
    while not is_done:
        # Convert the numpy observation to a float tensor for matmul.
        state = torch.from_numpy(state).float()
        # Greedy action: argmax over the linear scores state @ weight.
        action = torch.argmax(torch.matmul(state, weight))
        state, reward, is_done, _ = env.step(action.item())
        total_reward += reward
    return total_reward

# Random search: sample a fresh random weight matrix every episode and keep
# the one that achieved the highest total reward.
#
# - Build a random weight matrix
# - The agent maps states to actions through the weight matrix
# - The episode terminates and returns the total reward
# - Update the best total reward / best weight, and record the total reward
n_episode = 1000
best_total_reward = 0
best_weight = None
total_rewards = []

for e in range(n_episode):
    weight = torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    print('Episode {}: {}'.format(e+1, total_reward))
    if total_reward > best_total_reward:
        best_weight = weight
        best_total_reward = total_reward
    # Record every episode's reward (not only improvements) so the average
    # below is over all n_episode runs.
    total_rewards.append(total_reward)

print('Average total reward over {} episode: {}'.format(n_episode, sum(total_rewards) / n_episode))
# Average total reward over 1000 episode: 46.722

# Evaluate the single best weight found by the search over a fresh batch of
# episodes. The policy is now fixed; only environment randomness varies, so
# the average reward here measures how good the found policy really is.
n_episode_eval = 1000
total_rewards_eval = []
for episode in range(n_episode_eval):
    total_reward = run_episode(env, best_weight)
    print('Episode {}: {}'.format(episode+1, total_reward))
    total_rewards_eval.append(total_reward)

print('Average total reward over {} episode: {}'.format(n_episode_eval, sum(total_rewards_eval) / n_episode_eval))
# Average total reward over 1000 episode: 114.786

# Plot the per-episode rewards of the random-search phase ('search') against
# the replay of the best weight found ('eval') for visual comparison.
plt.plot(total_rewards, label='search')
plt.plot(total_rewards_eval, label='eval')
plt.xlabel('episode')
plt.ylabel('total_reward')
plt.legend()
plt.show()

# Repeat the random search, but stop as soon as some weight reaches the
# maximum score of 200 (the CartPole-v0 step limit) — no point searching
# further once a perfect-scoring policy is found.
n_episode = 1000
best_total_reward = 0
best_weight = None
total_rewards = []
for episode in range(n_episode):
    weight = torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    print('Episode {}: {}'.format(episode+1, total_reward))
    if total_reward > best_total_reward:
        best_weight = weight
        best_total_reward = total_reward
    total_rewards.append(total_reward)
    # Early exit once the episode cap (perfect score) has been hit.
    if best_total_reward == 200:
        break

# Estimate the expected number of episodes needed before random search first
# hits a perfect score of 200, by repeating the whole search n_training times.
# NOTE(review): if a training run never reaches 200 within n_episode episodes
# it contributes nothing to the sum, slightly biasing the estimate low.
n_training = 1000
n_episode_training = []
for _ in range(n_training):
    for episode in range(n_episode):
        weight = torch.rand(n_state, n_action)
        total_reward = run_episode(env, weight)
        if total_reward == 200:
            # Record how many episodes this run needed, then start a new run.
            n_episode_training.append(episode+1)
            break
print('Expectation of training episodes needed: ', sum(n_episode_training) / n_training)
# Expectation of training episodes needed:  14.26