http://www.glicko.net/glicko/glicko-boost.pdf suggests a straightforward extension to Glicko-2 that improves accuracy by taking the first-move advantage into account. Let's analyse its predictive power on real-world data.
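Concretely, the idea (a paraphrase, not necessarily the paper's exact parameterization) is to compute White's expected score as if the opponent were rated some advantage $\eta$ lower:

$$E = \frac{1}{1 + \exp\bigl(-g(\phi_j)\,(\mu - (\mu_j - \eta))\bigr)}$$

with $\mu$, $\mu_j$, $\phi_j$, $\eta$ in the Glicko-2 internal scale. In the code below, the `advantage` parameter plays the role of $\eta$ but stays on the familiar rating-point scale, i.e. it is divided by 173.7178 before entering this formula.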
import math
import collections
import itertools
import pandas
The match results of all standard rated games from February 2016 (chosen for its size: about 5M games), downloaded and extracted from https://database.lichess.org/.
matches = pandas.read_csv("matches-2016-02.tsv", sep="\t")
matches
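The analysis below relies on the columns white, black, result (White's score: 0, 0.5 or 1), tc (time control) and timestamp (seconds). The column names come from the code that follows; the dtypes in this tiny synthetic example are my assumption:
pandas.DataFrame(
    [["alice", "bob", 1.0, "300+0", 1454284800]],  # one synthetic row
    columns=["white", "black", "result", "tc", "timestamp"],
)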
We group time controls in the same way as the site does. For our purposes, ratings across speed groups are considered completely separate.
def speed(time_control):
    if time_control == "-":
        return "correspondence"
    time, inc = time_control.split("+", 1)
    # Estimated game duration: initial clock plus 40 moves worth of increment.
    estimate = int(time) + int(inc) * 40
    if estimate < 30:
        return "ultra"
    elif estimate < 180:
        return "bullet"
    elif estimate < 480:
        return "blitz"
    elif estimate < 1500:
        return "rapid"
    else:
        return "classical"
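A quick check of the bucketing (the example time controls are mine):
[speed(tc) for tc in ["-", "15+0", "60+0", "180+2", "600+5", "1800+20"]]
# -> ['correspondence', 'ultra', 'bullet', 'blitz', 'rapid', 'classical']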
Downloaded from https://code.google.com/archive/p/pyglicko2/, which is linked from http://www.glicko.net/glicko.html. Patched to fix a bug (an x1 == x2 float comparison without a threshold) and to allow fractional rating periods.
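The fractional-periods patch presumably comes down to the standard pre-rating-period RD update from step 6 of the Glicko-2 paper, with the number of periods t allowed to be any non-negative real; a sketch (the function name is mine):
import math

def pre_rating_rd(phi, sigma, t):
    # Glicko-2 step 6 generalized to fractional periods:
    # phi* = sqrt(phi^2 + t * sigma^2), everything in Glicko-2 scale.
    return math.sqrt(phi ** 2 + t * sigma ** 2)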
import glicko2
class Glicko2:
    def __init__(self, rating=1500, rd=350, volatility=0.06, timestamp=None, games=0):
        self.rating = rating
        self.rd = rd
        self.volatility = volatility
        self.timestamp = timestamp
        self.games = games

    def _player(self):
        return glicko2.Player(self.rating, self.rd, self.volatility)

    def at(self, timestamp):
        player = self._player()
        periods = 0
        if self.timestamp is not None:
            # Convert elapsed seconds to rating periods, at 0.21436 periods
            # per day (the rate lichess uses).
            periods = (timestamp - self.timestamp) * 0.21436 / (24 * 60 * 60)
        player._preRatingRD(periods=periods)
        return Glicko2(
            rating=max(self.rating, 600),  # rating floor
            rd=min(player.rd, 350),  # cap RD growth during inactivity
            volatility=min(self.volatility, 0.1),  # cap volatility
            timestamp=timestamp,
            games=self.games
        )

    def after(self, other, result, timestamp, advantage=0):
        us = self.at(timestamp)._player()
        them = other.at(timestamp)
        # Rate the game as if the opponent were `advantage` points weaker.
        us.update_player([them.rating - advantage], [them.rd], [result], periods=0)
        return Glicko2(
            rating=us.rating,
            rd=us.rd,
            volatility=us.vol,
            timestamp=timestamp,
            games=self.games + 1
        )

    def expected(self, other, timestamp, advantage=0):
        us = self.at(timestamp)
        them = other.at(timestamp)
        return us._player()._E((them.rating - advantage - 1500) / 173.7178, them.rd / 173.7178)

    def __repr__(self):
        return f"Glicko2(rating={self.rating}, rd={self.rd}, volatility={self.volatility}, ..., games={self.games})"
Glicko2(rd=60, timestamp=0).at(timestamp=365 * 24 * 60 * 60)
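To get a feel for the decay schedule, the RD after a few idle spans (the horizons are arbitrary examples):
[round(Glicko2(rd=60, timestamp=0).at(days * 24 * 60 * 60).rd, 1) for days in [0, 30, 180, 365]]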
Based on the distribution of results, we estimate an 11 rating point advantage for White to move.
by_result = matches.groupby("result")["white"].count()
by_result
black, draw, white = by_result
(white + 0.5 * draw) / (white + draw + black)
Glicko2(rd=60).expected(Glicko2(rd=60), timestamp=0, advantage=11) # rd barely matters here
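As a cross-check, inverting the expected-score formula turns an observed score s directly into a rating-point advantage; a sketch using the Glicko-2 constants (the helper name is mine):
import math

def advantage_from_score(s, rd=60):
    # Invert s = 1 / (1 + exp(-g(phi) * d / 173.7178)) for the rating
    # difference d, with g(phi) = 1 / sqrt(1 + 3 phi^2 / pi^2).
    phi = rd / 173.7178
    g = 1 / math.sqrt(1 + 3 * phi ** 2 / math.pi ** 2)
    return 173.7178 * math.log(s / (1 - s)) / g

advantage_from_score((white + 0.5 * draw) / (white + draw + black))  # ~11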
To evaluate the modified Glicko-2 system, we use the binomial deviance metric suggested in https://www.kaggle.com/c/ChessRatings2/overview/evaluation. Lower is better.
def deviance(expected, result):
    # Clamp predictions to [0.01, 0.99], as the Kaggle metric does.
    expected = min(max(expected, 0.01), 0.99)
    return -(result * math.log10(expected) + (1 - result) * math.log10(1 - expected))
deviance(1.0, 0.0) # Deviance of wrong prediction
deviance(0.5, 0.0) # Deviance of indifferent prediction
deviance(0.0, 0.0) # Deviance of correct prediction
class AggDeviance:
    def __init__(self):
        self.total_deviance = 0
        self.num_games = 0

    def add(self, deviance):
        self.total_deviance += deviance
        self.num_games += 1

    def avg(self):
        return self.total_deviance / self.num_games
def average_deviance(matches, advantage, ratings=None, by_player=None):
    ratings = ratings if ratings is not None else {}
    agg = AggDeviance()
    for row in matches.itertuples():
        white_key = f"{speed(row.tc)}/{row.white}"
        black_key = f"{speed(row.tc)}/{row.black}"
        white = ratings.get(white_key, None) or Glicko2()
        black = ratings.get(black_key, None) or Glicko2()
        # Score predictions only once both ratings are established.
        if black.games > 12 and white.games > 12:
            dev = deviance(white.expected(black, row.timestamp, advantage), row.result)
            agg.add(dev)
            # Used later on to analyse deviance for individual players.
            if by_player is not None:
                white_agg = by_player.get(white_key, None) or AggDeviance()
                black_agg = by_player.get(black_key, None) or AggDeviance()
                white_agg.add(dev)
                black_agg.add(dev)
                by_player[white_key] = white_agg
                by_player[black_key] = black_agg
        ratings[white_key] = white.after(black, row.result, row.timestamp, advantage)
        ratings[black_key] = black.after(white, 1.0 - row.result, row.timestamp, -advantage)
    return agg.avg()
ratings_0 = {}
by_player_0 = {}
base = average_deviance(matches, 0, ratings_0, by_player_0)
base
To get a feeling for deviance results, let's try another arbitrary change, like discarding all timestamps. (Note, however, that all matches are taken from a single month.)
matches_without_time = matches.copy()
matches_without_time.timestamp = 0
no_time = average_deviance(matches_without_time, 0)
no_time - base
Try various values for the first-move advantage.
hyper_params = pandas.DataFrame([
    [adv, average_deviance(matches, adv)] for adv in [-10, 0, 9, 10, 11, 12, 13, 20, 25, 30]
], columns=["advantage", "deviance"])
hyper_params
plot = hyper_params.plot.scatter(x="advantage", y="deviance")
plot.hlines([base, no_time], xmin=-10, xmax=50)
Indeed, an 11 rating point advantage for White minimizes the average deviance metric. The improvement is significant. For comparison, see the difference between the two horizontal lines (discarding all timestamps), or the leaderboard on https://www.kaggle.com/c/ChessRatings2/leaderboard.
The accuracy is better on average, but maybe there are players who are stronger with Black. I am not sure how best to analyse this.
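One simple probe (a sketch, not pursued further here; it ignores opponent strength): compare each player's raw average score as White with their average score as Black.
# Average score per player as White, and from Black's perspective as Black.
white_score = matches.groupby("white")["result"].mean()
black_score = 1.0 - matches.groupby("black")["result"].mean()
# Positive gap: the player scores relatively better with White.
(white_score - black_score).dropna().describe()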
Some histograms:
ratings_adv = {}
by_player_adv = {}
average_deviance(matches, 11, ratings_adv, by_player_adv)
df_0 = pandas.DataFrame(((player, agg.avg()) for player, agg in by_player_0.items()), columns=["player", "deviance"])
df_adv = pandas.DataFrame(((player, agg.avg()) for player, agg in by_player_adv.items()), columns=["player", "deviance"])
df_0.hist("deviance", bins=100, range=(0, 1.4))
df_adv.hist("deviance", bins=100, range=(0, 1.4))
df_0.max()
df_adv.max()
df_0.quantile(0.999)
df_adv.quantile(0.999)
df_0.quantile(0.99)
df_adv.quantile(0.99)
df_0.quantile(0.9)
df_adv.quantile(0.9)
Looking at the difference between old and new predictions for each individual player:
df_diff = pandas.DataFrame(
    ((player, by_player_adv[player].avg() - by_player_0[player].avg()) for player in by_player_0),
    columns=["player", "diff"]
)
df_diff.hist(bins=100)
df_diff.quantile(0.5)
df_diff.quantile(0.54)
df_diff.quantile(0.9)
df_diff.max()
ratings_0["rapid/zzzvvv"]
ratings_adv["rapid/zzzvvv"]
Same analysis, but for individuals with more than 100 games:
df_diff_many_games = pandas.DataFrame(
    ((player, by_player_adv[player].avg() - by_player_0[player].avg())
     for player in by_player_0
     if by_player_adv[player].num_games > 100),
    columns=["player", "diff"]
)
df_diff_many_games.hist(bins=50)
df_diff_many_games.quantile(0.5)
df_diff_many_games.quantile(0.54)
df_diff_many_games.quantile(0.9)
df_diff_many_games.max()
ratings_0["rapid/zyk67"]
ratings_adv["rapid/zyk67"]
Looking at the difference between the final ratings under the two systems:
df_rating_diff = pandas.DataFrame(
    ((player, ratings_adv[player].rating - ratings_0[player].rating)
     for player in ratings_0
     if ratings_0[player].games > 12),
    columns=["player", "rating diff"]
)
df_rating_diff.hist(bins=50)
df_rating_diff.min()
df_rating_diff.max()
df_rating_diff.quantile(0.1)