PIPD

Python Iterated Prisoner's Dilemma, with reinforcement learning. If you know a little Python, or are willing to learn, this should be fun to tweak to see what's going on. At the very least, you'll learn about uncommenting.

#!/usr/bin/env python

# Q-learning the Iterated Prisoner's Dilemma
# See http://en.wikipedia.org/wiki/Q-learning
# Peter Corbett, 2013

import math, random

# Standard prisoner's dilemma payoff matrix.
# You could try chicken or some other game.
# Moves: 0 = cooperate (C), 1 = defect (D).
# Indexed as payoff[my_move][their_move] -> (my_payoff, their_payoff).
payoff = (
    ((3, 3), (0, 5)),  # I cooperate: mutual reward / sucker's payoff
    ((5, 0), (1, 1)),  # I defect: temptation / mutual punishment
)

class Agent:
    """A player in the iterated prisoner's dilemma.

    Holds references to its own and the opponent's move histories
    (shared lists, appended to by the driver loop) and a Q-table
    mapping (state, action) pairs to learned expected discounted payoff.
    Moves are encoded 0 = cooperate, 1 = defect.
    """

    def __init__(self, myhist, othhist):
        # Shared history lists: myhist[i] is this agent's move on turn i,
        # othhist[i] is the opponent's move on turn i.
        self.myhist = myhist
        self.othhist = othhist
        self.payoff = []  # payoff history, one entry per turn
        self.q = {}  # Q-table: (state, action) -> learned value
        self.expt = 1  # count of exploratory (random) moves taken
        self.lr = 0.1  # learning rate
        self.expf = 0.95  # exploration factor - decays exponentially over time
        self.discount = 0.95  # discount applied to future rewards
        self.expect = 2.25  # expected payoff for one round
        # Expected payoff over the discount window; used as the optimistic
        # default value for state-action pairs we've never seen.
        self.dexpect = self.expect * ((1 / (1 - self.discount)) - 1)

    # Tit for Tat - old but gold, not so good at exploiting chumps
    def tft(self):
        """Copy the opponent's previous move."""
        return self.othhist[-1]

    # Win-stick-lose-change - this has many merits but can be a bit of a
    # wuss against some strategies
    def pavlov(self):
        """Repeat own last move if the opponent cooperated, else switch."""
        if self.othhist[-1] == 0:
            return self.myhist[-1]
        return 1 - self.myhist[-1]

    # Uses the learned data to decide
    def egreedy(self):
        """Epsilon-greedy move selection from the learned Q-table."""
        # The state is the last move by both agents
        state = (self.myhist[-1], self.othhist[-1])
        # Random chance of exploring - annealed on length of history
        if random.random() < (self.expf ** len(self.myhist)):
            # Alternative annealing schedule - lets you see more tit-for-tat
            # later on in the run - also doesn't result in one "dominant"
            # agent alternating cooperate/defect:
            # if random.random() < (10.0 / len(self.myhist)):
            # Keep track of how often we've experimented.
            # BUGFIX: this increment previously sat AFTER the return and
            # was never executed (dead code).
            self.expt += 1
            return random.randint(0, 1)
        # Otherwise exploit: pick the action with the best learned value.
        # BUGFIX: pickbest() returns an (action, value) pair; the original
        # returned the whole tuple where a 0/1 move was expected.
        return self.pickbest(state)[0]

    # Always cooperate - a chump for exploiting
    def coop(self):
        """Always cooperate."""
        return 0

    # Always defect - protect yourself by always defecting
    def defect(self):
        """Always defect."""
        return 1

    # Play randomly - another chump
    def rand(self):
        """Play uniformly at random."""
        return random.randint(0, 1)

    # Q-learning
    def learn(self):
        """One Q-learning step from the most recent move and payoff."""
        q = self.q
        # We've already made a move and seen the consequences, so the
        # "state" was the one the move we just saw the consequences of
        # was made from.
        state = (self.myhist[-2], self.othhist[-2])
        action = self.myhist[-1]
        # We have a state-action pair
        pair = (state, action)
        # Optimistic default for state-actions we've never seen
        if pair not in q:
            q[pair] = self.dexpect
        # What does the future look like, if we pick the next action
        # with the best outcome?
        # BUGFIX: pickbest() returns (action, value); the discounted-future
        # term needs the *value*, not the tuple.
        prospect = self.pickbest((self.myhist[-1], self.othhist[-1]))[1]
        # Here's the magic:
        # weighted average of the old value and a value consisting of the
        # immediate payoff plus a discounted version of the future.
        q[pair] = q[pair] + self.lr * (self.payoff[-1] + (self.discount * prospect) - q[pair])

    # look up a state-action pair in q
    def predict(self, state, action):
        """Return the Q-value for (state, action), creating it with the
        optimistic default if unseen."""
        if (state, action) not in self.q:
            self.q[(state, action)] = self.dexpect
        return self.q[(state, action)]

    # Look through state-action pairs, pick the best
    def pickbest(self, state):
        """Return (action, value) for the best-valued action in ``state``.

        Exact ties are broken randomly.
        """
        besta = -1
        besto = -999999
        for a in [0, 1]:
            o = self.predict(state, a)
            if o > besto or (o == besto and random.random() > 0.5):
                besto = o
                besta = a
        return (besta, besto)

# A turn of random history to seed the learners
hista = [random.randint(0, 1)]
histb = [random.randint(0, 1)]

# Running score totals
asc = 0
bsc = 0

# Set up two agents: each sees its own history first, the opponent's second
aa = Agent(hista, histb)
ab = Agent(histb, hista)

# Payoff history for the random seed moves.
# BUGFIX: originally indexed the payoff matrix with the history *lists*
# (payoff[hista][histb]) — a TypeError. Index with the seed moves and
# unpack the (a_payoff, b_payoff) pair.
apay, bpay = payoff[hista[0]][histb[0]]
aa.payoff.append(apay)
ab.payoff.append(bpay)

# Set these up to change the strategies e.g. aa.strat = aa.tft
aa.strat = aa.egreedy
ab.strat = ab.egreedy

# 10k iterations - try altering this
# (loop variable renamed from "iter" to avoid shadowing the builtin)
for turn in range(10000):
    # Pick the next move for each agent
    amove = aa.strat()
    bmove = ab.strat()
    # record it
    hista.append(amove)
    histb.append(bmove)
    # Calculate then record payoffs.
    # BUGFIX: the original set bpay = payoff[amove][bmove], giving agent b
    # a's whole payoff *tuple*. Unpack the (a_payoff, b_payoff) pair so
    # each agent records its own scalar payoff.
    apay, bpay = payoff[amove][bmove]
    aa.payoff.append(apay)
    ab.payoff.append(bpay)
    # Learn from the history
    aa.learn()
    ab.learn()
    # Tot up the scores
    asc += apay
    bsc += bpay
    # Uncomment for a blow-by-blow account
    # print("%s\t%s\t%s\t%s" % (amove, bmove, asc, bsc))

# Uncomment to see the learned Q-tables
# print(aa.q)
# print(ab.q)
# The final scores (Python 2 print statements updated to print() calls)
print(asc, bsc)