Python Iterated Prisoner's Dilemma, with reinforcement learning. If you know a little Python, or are willing to learn, this should be fun to tweak to see what's going on. At the very least, learn about uncommenting.
Download the Python code or just see below.
#!/usr/bin/env python3
# Q-learning the Iterated Prisoner's Dilemma
# See http://en.wikipedia.org/wiki/Q-learning
# Peter Corbett, 2013
import random
# Standard prisoners dilemma
# You could try chicken or some other game
# 0 = C 1 = D
payoff = (((3,3),(0,5)),
          ((5,0),(1,1)))
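# The matrix is indexed as payoff[mymove][theirmove], giving a tuple of
# (my payoff, their payoff). As a sketch, swapping in something like the
# following (0 = swerve, 1 = drive straight; a mutual crash scores worst)
# would turn the game into Chicken:
#payoff = (((3,3),(1,5)),
#          ((5,1),(0,0)))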
class Agent:
    def __init__(self, myhist, othhist):
        self.myhist = myhist
        self.othhist = othhist
        self.payoff = [] # payoff history
        self.q = {} # Learning matrix
        self.expt = 1 # count of exploratory moves
        self.lr = 0.1 # learning rate
        self.expf = 0.95 # Exploration factor - decays exponentially over time
        self.discount = 0.95
        self.expect = 2.25 # expected payoff for one round
        # expected payoff over the discount window
        self.dexpect = self.expect * ((1/(1-self.discount))-1)
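        # With the defaults that is 2.25 * (1/0.05 - 1) = 2.25 * 19 = 42.75,
        # the value later used as the default for unseen state-action pairs.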
    # Tit for Tat - old but gold, not so good at exploiting chumps
    def tft(self):
        return self.othhist[-1]
    # Win-stay, lose-shift (aka Pavlov) - this has many merits but can be
    # a bit of a wuss against some strategies
    def pavlov(self):
        if self.othhist[-1] == 0:
            return self.myhist[-1]
        return 1 - self.myhist[-1]
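    # (If the opponent cooperated last round our payoff was 3 or 5 - a win -
    # so we stay; if they defected it was 0 or 1, so we shift.)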
    # Uses the learned data to decide
    def egreedy(self):
        # The state is the last move by both agents
        state = (self.myhist[-1], self.othhist[-1])
        # Random chance of exploring at random - annealed based on the
        # length of the history
        if random.random() < (self.expf ** len(self.myhist)):
            # Alternative annealing schedule - lets you see more tit-for-tat
            # later on in the run - also doesn't result in one "dominant"
            # agent alternating cooperate/defect
            #if random.random() < (10.0/len(self.myhist)):
            # Keep track of how often we've explored - nothing else uses
            # this yet
            self.expt += 1
            return random.randint(0,1)
        # Otherwise - pick the best
        else:
            return self.pickbest(state)[0]
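    # (With the default schedule the chance of exploring is expf**t after t
    # moves: roughly 0.49 at t=14, 0.08 at t=50 and 0.006 at t=100, so the
    # agents are playing almost purely greedily within a couple of hundred
    # rounds.)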
    # Always cooperate - a chump for exploiting
    def coop(self):
        return 0
    # Always defect - the safe option, it can never be suckered
    def defect(self):
        return 1
    # Play randomly - another chump
    def rand(self):
        return random.randint(0,1)
    # Q-learning
    def learn(self):
        q = self.q
        # We've already made a move and seen the consequences, so the
        # "state" is the one that move was made in: the previous pair of
        # moves by both agents.
        state = (self.myhist[-2], self.othhist[-2])
        action = self.myhist[-1]
        # We have a state-action pair
        pair = (state, action)
        # Default for state-actions we've never seen
        if pair not in q: q[pair] = self.dexpect
        # What does the future look like, if we pick the next action
        # with the best outcome?
        prospect = self.pickbest((self.myhist[-1], self.othhist[-1]))[1]
        # Here's the magic:
        # a weighted average of the old value and a new estimate made of the
        # immediate payoff plus a discounted version of the future
        q[pair] = q[pair] + self.lr * (self.payoff[-1] + (self.discount*prospect) - q[pair])
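        # In standard notation: Q(s,a) += lr * (r + discount * max_a' Q(s',a') - Q(s,a))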
    # Look up a state-action pair in q, filling in the default if needed
    def predict(self, state, action):
        if (state, action) not in self.q:
            self.q[(state, action)] = self.dexpect
        return self.q[(state, action)]
    # Look through the actions for this state, pick the best (ties broken
    # at random)
    def pickbest(self, state):
        besta = -1
        besto = -999999
        for a in [0, 1]:
            o = self.predict(state, a)
            if o > besto or (o == besto and random.random() > 0.5):
                besto = o
                besta = a
        return (besta, besto)
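# Note that a state is just the previous moves of the two agents, so each
# agent only ever has 4 states x 2 actions = 8 Q values to learn.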
# A turn of random history
hista = [random.randint(0,1)]
histb = [random.randint(0,1)]
# Scores
asc = 0
bsc = 0
# Set up two agents
aa = Agent(hista, histb)
ab = Agent(histb, hista)
# Record the payoffs for those initial random moves
aa.payoff.append(payoff[hista[0]][histb[0]][0])
ab.payoff.append(payoff[hista[0]][histb[0]][1])
# Set the strategies here - change these to try others, e.g. aa.strat = aa.tft
aa.strat = aa.egreedy
ab.strat = ab.egreedy
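# For instance, to watch the learner exploit an unconditional cooperator,
# one could instead try:
#ab.strat = ab.coop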
# 10k iterations - try altering this
for i in range(10000):
    # Pick the next move for each agent
    amove = aa.strat()
    bmove = ab.strat()
    # Record it
    hista.append(amove)
    histb.append(bmove)
    # Calculate then record payoffs
    apay = payoff[amove][bmove][0]
    bpay = payoff[amove][bmove][1]
    aa.payoff.append(apay)
    ab.payoff.append(bpay)
    # Learn from the history
    aa.learn()
    ab.learn()
    # Tot up the scores
    asc += apay
    bsc += bpay
    # Uncomment for a blow-by-blow account
    #print("%s\t%s\t%s\t%s" % (amove, bmove, asc, bsc))
# Uncomment to see the learned Q values
#print(aa.q)
#print(ab.q)
# The final scores
print(asc, bsc)
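# For scale: 10000 rounds of mutual cooperation would score 30000 each,
# mutual defection 10000 each, and one-sided exploitation up to 50000.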