http://www.glicko.net/glicko/glicko-boost.pdf suggests a straightforward extension to Glicko-2 that improves accuracy by taking the first-move advantage into account. Let's analyse its predictive power on real-world data.
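Concretely, the idea (a paraphrase, not necessarily the paper's exact parameterization) is to compute White's expected score as if the opponent were rated some advantage $\eta$ lower:

$$E = \frac{1}{1 + \exp\bigl(-g(\phi_j)\,(\mu - (\mu_j - \eta))\bigr)}$$

with $\mu$, $\mu_j$, $\phi_j$, $\eta$ in the Glicko-2 internal scale. In the code below, the `advantage` parameter plays the role of $\eta$ but stays on the familiar rating-point scale, i.e. it is divided by 173.7178 before entering this formula.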
import math
import collections
import itertools
import pandas
The match results of all standard rated games from February 2016 (chosen for its size: about 5M games), downloaded and extracted from https://database.lichess.org/.
matches = pandas.read_csv("matches-2016-02.tsv", sep="\t")
matches
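The analysis below relies on the columns white, black, result (White's score: 0, 0.5 or 1), tc (time control) and timestamp (seconds). The column names come from the code that follows; the dtypes in this tiny synthetic example are my assumption:
pandas.DataFrame(
    [["alice", "bob", 1.0, "300+0", 1454284800]],  # one synthetic row
    columns=["white", "black", "result", "tc", "timestamp"],
)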
We group time controls in the same way as the site does. For our purposes, ratings across speed groups are considered completely separate.
def speed(time_control):
    if time_control == "-":
        return "correspondence"
    time, inc = time_control.split("+", 1)
    # Estimated game duration: initial clock plus 40 moves worth of increment.
    estimate = int(time) + int(inc) * 40
    if estimate < 30:
        return "ultra"
    elif estimate < 180:
        return "bullet"
    elif estimate < 480:
        return "blitz"
    elif estimate < 1500:
        return "rapid"
    else:
        return "classical"
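A quick check of the bucketing (the example time controls are mine):
[speed(tc) for tc in ["-", "15+0", "60+0", "180+2", "600+5", "1800+20"]]
# -> ['correspondence', 'ultra', 'bullet', 'blitz', 'rapid', 'classical']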
Downloaded from https://code.google.com/archive/p/pyglicko2/, which is linked from http://www.glicko.net/glicko.html. Patched to fix a bug (an x1 == x2 float comparison without a threshold) and to allow fractional rating periods.
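The fractional-periods patch presumably comes down to the standard pre-rating-period RD update from step 6 of the Glicko-2 paper, with the number of periods t allowed to be any non-negative real; a sketch (the function name is mine):
import math

def pre_rating_rd(phi, sigma, t):
    # Glicko-2 step 6 generalized to fractional periods:
    # phi* = sqrt(phi^2 + t * sigma^2), everything in Glicko-2 scale.
    return math.sqrt(phi ** 2 + t * sigma ** 2)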
import glicko2
class Glicko2:
    def __init__(self, rating=1500, rd=350, volatility=0.06, timestamp=None, games=0):
        self.rating = rating
        self.rd = rd
        self.volatility = volatility
        self.timestamp = timestamp
        self.games = games

    def _player(self):
        return glicko2.Player(self.rating, self.rd, self.volatility)

    def at(self, timestamp):
        player = self._player()
        periods = 0
        if self.timestamp is not None:
            # Convert elapsed seconds to rating periods, at 0.21436 periods
            # per day (the rate lichess uses).
            periods = (timestamp - self.timestamp) * 0.21436 / (24 * 60 * 60)
        player._preRatingRD(periods=periods)
        return Glicko2(
            rating=max(self.rating, 600),  # rating floor
            rd=min(player.rd, 350),  # cap RD growth during inactivity
            volatility=min(self.volatility, 0.1),  # cap volatility
            timestamp=timestamp,
            games=self.games
        )

    def after(self, other, result, timestamp, advantage=0):
        us = self.at(timestamp)._player()
        them = other.at(timestamp)
        # Rate the game as if the opponent were `advantage` points weaker.
        us.update_player([them.rating - advantage], [them.rd], [result], periods=0)
        return Glicko2(
            rating=us.rating,
            rd=us.rd,
            volatility=us.vol,
            timestamp=timestamp,
            games=self.games + 1
        )

    def expected(self, other, timestamp, advantage=0):
        us = self.at(timestamp)
        them = other.at(timestamp)
        return us._player()._E((them.rating - advantage - 1500) / 173.7178, them.rd / 173.7178)

    def __repr__(self):
        return f"Glicko2(rating={self.rating}, rd={self.rd}, volatility={self.volatility}, ..., games={self.games})"
Glicko2(rd=60, timestamp=0).at(timestamp=365 * 24 * 60 * 60)
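To get a feel for the decay schedule, the RD after a few idle spans (the horizons are arbitrary examples):
[round(Glicko2(rd=60, timestamp=0).at(days * 24 * 60 * 60).rd, 1) for days in [0, 30, 180, 365]]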
Based on the distribution of results, we estimate an 11 rating point advantage for White to move.
by_result = matches.groupby("result")["white"].count()
by_result
black, draw, white = by_result
(white + 0.5 * draw) / (white + draw + black)
Glicko2(rd=60).expected(Glicko2(rd=60), timestamp=0, advantage=11) # rd barely matters here
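As a cross-check, inverting the expected-score formula turns an observed score s directly into a rating-point advantage; a sketch using the Glicko-2 constants (the helper name is mine):
import math

def advantage_from_score(s, rd=60):
    # Invert s = 1 / (1 + exp(-g(phi) * d / 173.7178)) for the rating
    # difference d, with g(phi) = 1 / sqrt(1 + 3 phi^2 / pi^2).
    phi = rd / 173.7178
    g = 1 / math.sqrt(1 + 3 * phi ** 2 / math.pi ** 2)
    return 173.7178 * math.log(s / (1 - s)) / g

advantage_from_score((white + 0.5 * draw) / (white + draw + black))  # ~11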
To evaluate the modified Glicko-2 system, we use the binomial deviance metric suggested in https://www.kaggle.com/c/ChessRatings2/overview/evaluation. Lower is better.
def deviance(expected, result):
    # Clamp predictions to [0.01, 0.99], as the Kaggle metric does.
    expected = min(max(expected, 0.01), 0.99)
    return -(result * math.log10(expected) + (1 - result) * math.log10(1 - expected))
deviance(1.0, 0.0) # Deviance of wrong prediction
deviance(0.5, 0.0) # Deviance of indifferent prediction
deviance(0.0, 0.0) # Deviance of correct prediction
class AggDeviance:
    def __init__(self):
        self.total_deviance = 0
        self.num_games = 0

    def add(self, deviance):
        self.total_deviance += deviance
        self.num_games += 1

    def avg(self):
        return self.total_deviance / self.num_games
def average_deviance(matches, advantage, ratings=None, by_player=None):
    ratings = ratings if ratings is not None else {}
    agg = AggDeviance()
    for row in matches.itertuples():
        white_key = f"{speed(row.tc)}/{row.white}"
        black_key = f"{speed(row.tc)}/{row.black}"
        white = ratings.get(white_key, None) or Glicko2()
        black = ratings.get(black_key, None) or Glicko2()
        # Score predictions only once both ratings are established.
        if black.games > 12 and white.games > 12:
            dev = deviance(white.expected(black, row.timestamp, advantage), row.result)
            agg.add(dev)
            # Used later on to analyse deviance for individual players.
            if by_player is not None:
                white_agg = by_player.get(white_key, None) or AggDeviance()
                black_agg = by_player.get(black_key, None) or AggDeviance()
                white_agg.add(dev)
                black_agg.add(dev)
                by_player[white_key] = white_agg
                by_player[black_key] = black_agg
        ratings[white_key] = white.after(black, row.result, row.timestamp, advantage)
        ratings[black_key] = black.after(white, 1.0 - row.result, row.timestamp, -advantage)
    return agg.avg()
ratings_0 = {}
by_player_0 = {}
base = average_deviance(matches, 0, ratings_0, by_player_0)
base
To get a feeling for deviance results, let's try another arbitrary change, like discarding all timestamps. (Note, however, that all matches are taken from a single month.)
matches_without_time = matches.copy()
matches_without_time.timestamp = 0
no_time = average_deviance(matches_without_time, 0)
no_time - base
Try various values for the first-move advantage.
hyper_params = pandas.DataFrame([
    [adv, average_deviance(matches, adv)] for adv in [-10, 0, 9, 10, 11, 12, 13, 20, 25, 30]
], columns=["advantage", "deviance"])
hyper_params
plot = hyper_params.plot.scatter(x="advantage", y="deviance")
plot.hlines([base, no_time], xmin=-10, xmax=50)
Indeed, an 11 rating point advantage for White minimizes the average deviance metric. The improvement is significant. For comparison, see the difference between the two horizontal lines (discarding all timestamps), or the leaderboard on https://www.kaggle.com/c/ChessRatings2/leaderboard.
The accuracy is better on average, but maybe there are players who are stronger with Black. I am not sure how best to analyse this.
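One simple probe (a sketch, not pursued further here; it ignores opponent strength): compare each player's raw average score as White with their average score as Black.
# Average score per player as White, and from Black's perspective as Black.
white_score = matches.groupby("white")["result"].mean()
black_score = 1.0 - matches.groupby("black")["result"].mean()
# Positive gap: the player scores relatively better with White.
(white_score - black_score).dropna().describe()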
Some histograms:
ratings_adv = {}
by_player_adv = {}
average_deviance(matches, 11, ratings_adv, by_player_adv)
df_0 = pandas.DataFrame(((player, agg.avg()) for player, agg in by_player_0.items()), columns=["player", "deviance"])
df_adv = pandas.DataFrame(((player, agg.avg()) for player, agg in by_player_adv.items()), columns=["player", "deviance"])
df_0.hist("deviance", bins=100, range=(0, 1.4))
df_adv.hist("deviance", bins=100, range=(0, 1.4))
df_0.max()
df_adv.max()
df_0.quantile(0.999)
df_adv.quantile(0.999)
df_0.quantile(0.99)
df_adv.quantile(0.99)
df_0.quantile(0.9)
df_adv.quantile(0.9)
Looking at the difference between old and new predictions for each individual player:
df_diff = pandas.DataFrame(
    ((player, by_player_adv[player].avg() - by_player_0[player].avg()) for player in by_player_0),
    columns=["player", "diff"]
)
df_diff.hist(bins=100)
df_diff.quantile(0.5)
df_diff.quantile(0.54)
df_diff.quantile(0.9)
df_diff.max()
ratings_0["rapid/zzzvvv"]
ratings_adv["rapid/zzzvvv"]
Same analysis, but for individuals with more than 100 games:
df_diff_many_games = pandas.DataFrame(
    ((player, by_player_adv[player].avg() - by_player_0[player].avg())
     for player in by_player_0
     if by_player_adv[player].num_games > 100),
    columns=["player", "diff"]
)
df_diff_many_games.hist(bins=50)
df_diff_many_games.quantile(0.5)
df_diff_many_games.quantile(0.54)
df_diff_many_games.quantile(0.9)
df_diff_many_games.max()
ratings_0["rapid/zyk67"]
ratings_adv["rapid/zyk67"]
Looking at the difference between the final ratings under the two systems:
df_rating_diff = pandas.DataFrame(
    ((player, ratings_adv[player].rating - ratings_0[player].rating)
     for player in ratings_0
     if ratings_0[player].games > 12),
    columns=["player", "rating diff"]
)
df_rating_diff.hist(bins=50)
df_rating_diff.min()
df_rating_diff.max()
df_rating_diff.quantile(0.1)