# PIPD

Python Iterated Prisoner's Dilemma, with reinforcement learning. If you know a little Python, or are willing to learn, this should be fun to tweak to see what's going on. At the very least, you'll learn about uncommenting.

```python
#!/usr/bin/env python
# Q-learning the Iterated Prisoner's Dilemma
# See http://en.wikipedia.org/wiki/Q-learning
# Peter Corbett, 2013
import math, random

# Standard prisoners dilemma
# You could try chicken or some other game
# 0 = C 1 = D
payoff = (((3, 3), (0, 5)),
          ((5, 0), (1, 1)))


class Agent:
    """One player. Holds references to both move histories (shared lists,
    updated by the driver loop below) plus its own payoff history and
    Q-learning table."""

    def __init__(self, myhist, othhist):
        self.myhist = myhist    # my moves, shared list appended by the driver
        self.othhist = othhist  # opponent's moves, likewise shared
        self.payoff = []        # payoff history
        self.q = {}             # Learning matrix: (state, action) -> value
        self.expt = 1           # count of exploration moves taken
        self.lr = 0.1           # learning rate
        self.expf = 0.95        # Exploration factor - decays exponentially over time
        self.discount = 0.95
        self.expect = 2.25      # expected for one round
        # expected over discount window
        self.dexpect = self.expect * ((1 / (1 - self.discount)) - 1)

    # Tit for Tat - old but gold, not so good at exploiting chumps
    def tft(self):
        return self.othhist[-1]

    # Win-stick-lose-change - this has many merits but can be a bit of a
    # wuss against some strategies
    def pavlov(self):
        if self.othhist[-1] == 0:
            return self.myhist[-1]
        return 1 - self.myhist[-1]

    # Uses the learned data to decide
    def egreedy(self):
        # The state is the last move by both agents
        state = (self.myhist[-1], self.othhist[-1])
        # random chance of randomly explore - anneal based on length of history
        if random.random() < (self.expf ** len(self.myhist)):
            # Alternative annealing schedule - lets you see more tit-for-tat
            # later on in the run - also doesn't result in one "dominant" agent
            # alternating cooperate/defect
            # if random.random() < (10.0 / len(self.myhist)):
            # Keep track of how often we've experimented.
            # (Fix: in the original this increment sat after the return and
            # so was unreachable dead code.)
            self.expt += 1
            return random.randint(0, 1)
        else:
            # Otherwise - pick the best
            return self.pickbest(state)[0]

    # Always cooperate - a chump for exploiting
    def coop(self):
        return 0

    # Always defect - protect yourself by always defecting
    def defect(self):
        return 1

    # Play randomly - another chump
    def rand(self):
        return random.randint(0, 1)

    # Q-learning
    def learn(self):
        q = self.q
        # We've already made a move and seen the consequences, so the
        # "state" was the move we just saw the consequences of.
        state = (self.myhist[-2], self.othhist[-2])
        action = self.myhist[-1]
        # We have a state-action pair
        pair = (state, action)
        # Default for state-actions we've never seen
        if pair not in q:
            q[pair] = self.dexpect
        # What does the future look like, if we pick the next action
        # with the best outcome?
        prospect = self.pickbest((self.myhist[-1], self.othhist[-1]))[1]
        # Here's the magic
        # Weighted average of the old value and a value consisting of the
        # immediate payoff, and a discounted version of the future
        q[pair] = q[pair] + self.lr * (self.payoff[-1]
                                       + (self.discount * prospect) - q[pair])

    # look up a state-action pair in q
    def predict(self, state, action):
        if (state, action) not in self.q:
            self.q[(state, action)] = self.dexpect
        return self.q[(state, action)]

    # Look through state-action pairs, pick the best (random tie-break)
    def pickbest(self, state):
        besta = -1
        besto = -999999
        for a in [0, 1]:
            o = self.predict(state, a)
            if o > besto or (o == besto and random.random() > 0.5):
                besto = o
                besta = a
        return (besta, besto)


# A turn of random history
hista = [random.randint(0, 1)]
histb = [random.randint(0, 1)]
# Scores
asc = 0
bsc = 0
# Set up two agents
aa = Agent(hista, histb)
ab = Agent(histb, hista)
# Payoff history as for the random moves
aa.payoff.append(payoff[hista[0]][histb[0]][0])
ab.payoff.append(payoff[hista[0]][histb[0]][1])
# set these up to change the strategies eg aa.strat = aa.tft
aa.strat = aa.egreedy
ab.strat = ab.egreedy

# 10k iterations - try altering this
# (loop variable renamed from `iter`, which shadowed the builtin)
for turn in range(10000):
    # Pick the next move for each agent
    amove = aa.strat()
    bmove = ab.strat()
    # record it
    hista.append(amove)
    histb.append(bmove)
    # Calculate then record payoffs
    apay = payoff[amove][bmove][0]
    bpay = payoff[amove][bmove][1]
    aa.payoff.append(apay)
    ab.payoff.append(bpay)
    # Learn from the history
    aa.learn()
    ab.learn()
    # Tot up the scores
    asc += apay
    bsc += bpay
    # Uncomment for a blow-by-blow account
    # print("%s\t%s\t%s\t%s" % (amove, bmove, asc, bsc))
    # Uncomment to see Q
    # print(aa.q)
    # print(ab.q)

# The scores
print(asc, bsc)
```