PIPD

Python Iterated Prisoner's Dilemma, with reinforcement learning. If you know a little Python, or are willing to learn, this should be fun to tweak to see what's going on. At the very least, learn about uncommenting.

Download the Python code or just see below.

#!/usr/bin/env python3

# Q-learning the Iterated Prisoner's Dilemma
# See http://en.wikipedia.org/wiki/Q-learning
# Peter Corbett, 2013

import random

# Standard Prisoner's Dilemma payoffs
# You could try Chicken or some other game
# 0 = C (cooperate), 1 = D (defect)
payoff = (((3,3),(0,5)),
          ((5,0),(1,1)))
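# payoff[my_move][their_move] gives (my_payoff, their_payoff)
# e.g. payoff[1][0] == (5,0): I defect, they cooperate, I take the temptation payoff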

class Agent:
    def __init__(self, myhist, othhist):
        self.myhist = myhist
        self.othhist = othhist
        self.payoff = [] # payoff history
        self.q = {} # Learning matrix
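        # Keys are ((my_last_move, their_last_move), action); values are the
        # estimated discounted future payoff of taking that action in that state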
        self.expt = 1 # count of exploratory (random) moves
        self.lr = 0.1 # learning rate
        self.expf = 0.95 # Exploration factor - decays exponentially over time
        self.discount = 0.95
        self.expect = 2.25 # expected payoff for one round (the average of the four payoffs)
        # Expected payoff over the rest of the discount window
        self.dexpect = self.expect * ((1/(1-self.discount))-1)
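        # e.g. 2.25 * (0.95 + 0.95**2 + ...) = 2.25 * ((1/(1-0.95)) - 1) = 42.75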

    # Tit for Tat - old but gold, not so good at exploiting chumps
    def tft(self):
        return self.othhist[-1]

    # Win-stick-lose-change - this has many merits but can be a bit of a
    # wuss against some strategies
    def pavlov(self):
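        # If they cooperated last round I scored 3 or 5 (a win), so repeat my
        # move; if they defected I scored 0 or 1 (a loss), so switch.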
        if(self.othhist[-1] == 0): return self.myhist[-1]
        return 1-self.myhist[-1]

    # Uses the learned data to decide
    def egreedy(self):
        # The state is the last move by both agents
        state = (self.myhist[-1],self.othhist[-1])
        # Some chance of exploring at random - annealed based on the length of the history
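        # With expf = 0.95 this probability falls below 1% after roughly 90 moves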
        if(random.random() < (self.expf ** len(self.myhist))):
        # Alternative annealing schedule - lets you see more tit-for-tat
        # later on in the run - also doesn't result in one "dominant" agent
        # alternating cooperate/defect
        #if(random.random() < (10.0/len(self.myhist))):
            # Keep track of how often we've experimented - this was previously
            # used for something but now seems to be obsolete...
            self.expt += 1
            return random.randint(0,1)
        # Otherwise - pick the best
        else:
            return self.pickbest(state)[0]

    # Always cooperate - a chump for exploiting
    def coop(self):
        return 0

    # Always defect - protect yourself by always defecting
    def defect(self):
        return 1

    # Play randomly - another chump
    def rand(self):
        return random.randint(0,1)

    # Q-learning
    def learn(self):
        q = self.q
        # We've already made a move and seen the consequences. The "state" is
        # the pair of moves from the round before, and the "action" is the
        # move we've just seen the consequences of.
        state = (self.myhist[-2], self.othhist[-2])
        action = self.myhist[-1]
        # We have a state-action pair
        pair = (state,action)
        # Default for state-actions we've never seen
        if pair not in q: q[pair] = self.dexpect
        # What does the future look like, if we pick the next action
        # with the best outcome?
        prospect = self.pickbest((self.myhist[-1],self.othhist[-1]))[1]
        # Here's the magic
        # Weighted average of the old value and a value consisting of the
        # immediate payoff, and a discounted version of the future
        q[pair] = q[pair] + self.lr * (self.payoff[-1] + (self.discount*prospect) - q[pair])
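        # In standard notation: Q(s,a) <- Q(s,a) + lr*(r + discount*max_a' Q(s',a') - Q(s,a))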

    # look up a state-action pair in q
    def predict(self,state,action):
        if (state,action) not in self.q:
            self.q[(state,action)] = self.dexpect
        return self.q[(state,action)]

    # Look through state-action pairs, pick the best
    def pickbest(self,state):
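        # Returns (best_action, best_value) - callers use whichever they need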
        besta = -1
        besto = -999999
        for a in [0,1]:
            o = self.predict(state,a)
            if o > besto or (o == besto and random.random() > 0.5):
                besto = o
                besta = a
        return (besta, besto)

# Seed each agent's history with one random opening move
hista = [random.randint(0,1)]
histb = [random.randint(0,1)]

# Scores
asc = 0
bsc = 0

# Set up two agents

aa = Agent(hista, histb)
ab = Agent(histb, hista)

# Record the payoffs for the random opening moves
aa.payoff.append(payoff[hista[0]][histb[0]][0])
ab.payoff.append(payoff[hista[0]][histb[0]][1])

# set these up to change the strategies eg aa.strat = aa.tft
aa.strat = aa.egreedy
ab.strat = ab.egreedy
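# Other strategies to try: tft, pavlov, coop, defect, rand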
# 10k iterations - try altering this
for i in range(10000):
    # Pick the next move for each agent
    amove = aa.strat()
    bmove = ab.strat()
    # record it
    hista.append(amove)
    histb.append(bmove)
    # Calculate then record payoffs
    apay = payoff[amove][bmove][0]
    bpay = payoff[amove][bmove][1]
    aa.payoff.append(apay)
    ab.payoff.append(bpay)
    # Learn from the history
    aa.learn()
    ab.learn()
    # Tot up the scores
    asc += apay
    bsc += bpay
    # Uncomment for a blow-by-blow account
    #print "%s\t%s\t%s\t%s" % (amove, bmove, asc, bsc)

# Uncomment to see Q
#print(aa.q)
#print(ab.q)
# The scores
print(asc, bsc)
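
For instance, to pit the Q-learner against plain Tit for Tat instead of another learner, change the two strategy assignments near the bottom to something like:

aa.strat = aa.egreedy
ab.strat = ab.tft

then run it again and see how the final scores compare.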