dashee87 / blogscripts Goto Github PK
View Code? Open in Web Editor NEWRepository for code used in my blog posts
License: MIT License
Repository for code used in my blog posts
License: MIT License
The blog post and scripts use http://www.football-data.co.uk/mmz4281/1718/E0.csv,
which no longer works. To fix this, use https://www.football-data.co.uk/mmz4281/1718/E0.csv
instead. Just sharing this in case anyone else runs into the same issue.
epl_id <-fdo_listComps(season = 2016,response = "minified") %>% filter(league=="PL")
gives this error
Error in filter(., league == "PL") : object 'league' not found
In addition: Warning message:
In storage.mode(x) <- "double" : NAs introduced by coercion
Hi,
thanks for your very useful blog posts! We've been using them for teaching our software-practicals. One thing that regularly comes up is that the Dixon-Coles implementation is extremely slow. This is due to the use of python loops.
The performance can be significantly increased by making use of vectorization and pulling calculations that can be done ahead-of-time out of the optimization objective function. Most prominently, this includes looking up the correct team indices for each match. There are a few other tricks with diminishing returns. In total the runtime can be sped up by a factor of about 1000.
I got the impression this repository is not really maintained so I'm not going through the effort of a PR, just stating this for other visitors.
My own implementation is below (with somewhat different DataFrame column names and parameter dict):
import numpy as np
import pandas as pd
from scipy.stats import poisson
from scipy.optimize import minimize
from datetime import datetime
def predict_outcome_probs(
    params: dict,
    team1: str,
    team2: str,
) -> np.ndarray:
    """
    Collapse the Dixon-Coles score matrix into match outcome probabilities.

    Args:
        params: trained parameters returned by ``train_parameters``
        team1: name of the home team
        team2: name of the away team

    Returns:
        Array ``[p_home, p_draw, p_away]``.
    """
    score_matrix = predict_score_probs(params, team1, team2)
    # Rows are home goals and columns are away goals, so entries strictly
    # below the diagonal are home wins and entries strictly above it are
    # away wins.
    home_win = np.tril(score_matrix, -1).sum()
    draw = np.diag(score_matrix).sum()
    away_win = np.triu(score_matrix, 1).sum()
    return np.array([home_win, draw, away_win])
def train_parameters(
    matches: pd.DataFrame,
    tau: float = None,
    t0: datetime = None,
    initial: dict = None,
    options: dict = None,
) -> dict:
    """
    Train Dixon-Coles model and return trained parameter values.

    The optimizer works on a flat vector laid out as
    ``[attack (n_teams), defense (n_teams), home, rho]``, with teams in the
    sorted order produced by ``np.unique``.

    Args:
        matches: dataframe with columns Team1, Team2, Score1, Score2 and,
            when ``tau`` is given, Datetime
        tau: mean lifetime (in days) for weight decay of the match
            importance; ``None`` weights all matches equally
        t0: reference time for weight decay (defaults to ``datetime.now()``)
        initial: initial parameter values, either a flat array in the layout
            described above or a dict in the format returned by this
            function (teams missing from the dict start at 0); ``None``
            draws random attack/defense values and uses home=1, rho=0
        options: optimizer options passed to ``scipy.optimize.minimize``
            (defaults to ``{"maxiter": 100}``)

    Returns:
        Dict with keys ``'base'``, ``'attack'``, ``'defense'``, ``'home'``
        and ``'rho'``; ``'attack'`` and ``'defense'`` map team name to
        coefficient.
    """
    # Build the default per call instead of sharing one mutable dict
    # between all invocations.
    if options is None:
        options = {"maxiter": 100}

    # Resolve team names to integer indices once, outside the objective.
    teams = np.unique([matches.Team1, matches.Team2])
    n_teams = len(teams)
    team1 = pd.Categorical(matches.Team1, categories=teams).codes
    team2 = pd.Categorical(matches.Team2, categories=teams).codes
    score1 = matches.Score1.values
    score2 = matches.Score2.values

    if tau is None:
        weights = 1
    else:
        if t0 is None:
            t0 = datetime.now()
        time = (t0 - matches.Datetime).dt.days
        weights = np.exp(-time / tau)

    if initial is None:
        initial = np.concatenate((
            np.random.uniform(0, 1, n_teams),   # attack
            # Negated draw instead of uniform(0, -1): same values, but does
            # not rely on numpy's behaviour for low > high.
            -np.random.uniform(0, 1, n_teams),  # defense
            np.array([1.0, 0.0]),               # home, rho
        ))
    elif isinstance(initial, dict):
        # Accept a parameter dict (e.g. a previous result) as a warm start.
        initial = np.array(
            [initial['attack'].get(team, 0.0) for team in teams]
            + [initial['defense'].get(team, 0.0) for team in teams]
            + [initial['home'], initial['rho']],
            dtype=float,
        )

    def objective(params):
        attack_coefs = params[:n_teams]
        defend_coefs = params[n_teams:-2]
        home, rho = params[-2:]
        # dc_log_likelihood takes ``rho`` before ``home`` positionally, so
        # pass both by keyword to bind each value to the right parameter.
        return -dc_log_likelihood(
            score1, score2,
            attack_coefs[team1],
            defend_coefs[team1],
            attack_coefs[team2],
            defend_coefs[team2],
            rho=rho, home=home, weights=weights,
        ).sum()

    opt_output = minimize(
        objective,
        initial,
        options=options,
        method='L-BFGS-B',
    )
    fitted = opt_output.x
    return {
        'base': 0,
        'attack': dict(zip(teams, fitted[:n_teams])),
        'defense': dict(zip(teams, fitted[n_teams:-2])),
        'home': fitted[-2],
        'rho': fitted[-1],
    }
def rho_correction(goals1, goals2, lambda1, lambda2, rho):
    """
    Dixon-Coles adjustment factor for the low-scoring results.

    Only the 0:0, 0:1, 1:0 and 1:1 scorelines are adjusted; every other
    scoreline gets a factor of exactly 1. ``rho`` is first clipped to stay
    1e-3 inside the interval
    ``[-1 / max(lambda1, lambda2), 1 / max(lambda1 * lambda2, 1)]``
    evaluated over all supplied rates.

    Args:
        goals1: goals of the first (home) team
        goals2: goals of the second (away) team
        lambda1: expected goals of the first team (needs a ``.max()`` method)
        lambda2: expected goals of the second team (needs a ``.max()`` method)
        rho: dependence parameter

    Returns:
        Multiplicative correction, broadcast over the inputs.
    """
    largest_rate = np.maximum(lambda1.max(), lambda2.max())
    largest_product = np.maximum((lambda1 * lambda2).max(), 1.0)
    lower_bound = -1 / largest_rate + 1e-3
    upper_bound = 1 / largest_product - 1e-3
    rho = np.clip(rho, lower_bound, upper_bound)

    is_low_score = (goals1 < 2) & (goals2 < 2)
    home_factor = np.where(goals1 == 0, -lambda1, 1.0)
    away_factor = np.where(goals2 == 0, lambda2, -1.0)
    rho_factor = np.where(is_low_score, rho, 0.0)
    return 1 + home_factor * away_factor * rho_factor
def dc_log_likelihood(
    goals1, goals2,
    attack1, defend1,
    attack2, defend2,
    rho, home,
    weights=1,
):
    """
    Weighted per-match log likelihood of the Dixon-Coles model.

    Note that ``rho`` comes before ``home`` in the positional order.

    Args:
        goals1: goals scored by the home team
        goals2: goals scored by the away team
        attack1: attack coefficient of the home team
        defend1: defense coefficient of the home team
        attack2: attack coefficient of the away team
        defend2: defense coefficient of the away team
        rho: dependence parameter of the low-score correction
        home: home advantage added to the home team's log rate
        weights: multiplicative weight applied to each match

    Returns:
        One weighted log-likelihood term per match (not summed).
    """
    home_rate = np.exp(attack1 + defend2 + home)
    away_rate = np.exp(attack2 + defend1)

    log_correction = np.log(
        rho_correction(goals1, goals2, home_rate, away_rate, rho))
    log_home_goals = poisson.logpmf(goals1, home_rate)
    log_away_goals = poisson.logpmf(goals2, away_rate)
    return weights * (log_correction + log_home_goals + log_away_goals)
def predict_score_probs(params, team1, team2, max_goals=10):
    """
    Score probability matrix under the Dixon-Coles model.

    Starts from the independent Poisson matrix and rescales its top-left
    2x2 corner (scores 0:0, 0:1, 1:0, 1:1) with ``rho_correction``.

    Args:
        params: trained parameters returned by ``train_parameters``
        team1: name of the home team
        team2: name of the away team
        max_goals: largest goal count per team included in the matrix

    Returns:
        Matrix indexed as ``[home_goals, away_goals]``.
    """
    score_matrix, rates = predict_score_probs_poisson(
        params, team1, team2, max_goals)
    home_rate, away_rate = rates

    low_goals = np.arange(2)
    adjustment = rho_correction(
        low_goals[:, None],
        low_goals[None, :],
        home_rate, away_rate,
        params['rho'],
    )
    score_matrix[:2, :2] *= adjustment
    return score_matrix
def predict_score_probs_poisson(params, team1, team2, max_goals=10):
    """
    Score probability matrix under the independent Poisson model.

    Args:
        params: trained parameters returned by ``train_parameters``
        team1: name of the home team
        team2: name of the away team
        max_goals: largest goal count per team included in the matrix

    Returns:
        Tuple ``(matrix, rates)`` where ``matrix[i, j]`` is the probability
        of the home team scoring ``i`` and the away team scoring ``j``
        goals, and ``rates`` holds the two expected goal rates.
    """
    rates = get_expected_goals(params, team1, team2)
    goal_counts = np.arange(max_goals + 1)[None, :]
    # Broadcasting a (2, 1) rate column against the (1, n) goal counts
    # yields one pmf row per team.
    pmf_rows = poisson.pmf(goal_counts, rates.reshape(2, 1))
    home_pmf, away_pmf = pmf_rows
    return np.outer(home_pmf, away_pmf), rates
def get_expected_goals(params, team1, team2):
    """
    Expected goal rates of both teams for a single fixture.

    Teams missing from the ``'attack'`` / ``'defense'`` tables contribute a
    coefficient of 0.

    Args:
        params: dict with keys ``'base'``, ``'home'``, ``'attack'`` and
            ``'defense'``
        team1: name of the home team
        team2: name of the away team

    Returns:
        Array ``[home_rate, away_rate]``.
    """
    attack_table = params['attack']
    defense_table = params['defense']
    intercept = params['base']
    home_advantage = params['home']

    home_log_rate = (
        intercept
        + attack_table.get(team1, 0)
        + defense_table.get(team2, 0)
        + home_advantage
    )
    away_log_rate = (
        intercept
        + defense_table.get(team1, 0)
        + attack_table.get(team2, 0)
    )
    return np.array([np.exp(home_log_rate), np.exp(away_log_rate)])
This can be made even slightly faster, e.g. by using jax:
--- dixon_coles.py 2022-01-06 15:31:29.235639125 +0100
+++ dixon_coles_jax.py 2022-01-06 15:40:24.100362012 +0100
@@ -1,11 +1,16 @@
+import jax
+import jax.numpy as jnp
import numpy as np
import pandas as pd
-from scipy.stats import poisson
+from jax.scipy.stats import poisson
from scipy.optimize import minimize
from datetime import datetime
+jax.config.update("jax_enable_x64", True)
+
+
def predict_outcome_probs(
params: dict,
team1: str,
@@ -57,7 +62,7 @@
if t0 is None:
t0 = datetime.now()
time = (t0 - matches.Datetime).dt.days
- weights = np.exp(-time / tau)
+ weights = jnp.array(np.exp(-time / tau))
n_teams = len(teams)
if initial is None:
@@ -67,7 +72,9 @@
np.array([1.0, 0.0]), # home, rho
))
- def objective(params):
+ @jax.jit
+ @jax.value_and_grad
+ def objective_jax(params):
attack_coefs = params[:n_teams]
defend_coefs = params[n_teams:-2]
home, rho = params[-2:]
@@ -80,11 +87,17 @@
home, rho, weights,
).sum()
+ def objective(x):
+ loss, grad = objective_jax(jnp.array(x, dtype=jnp.float64))
+ return (np.array(loss, dtype=np.float64),
+ np.array(grad, dtype=np.float64))
+
opt_output = minimize(
objective,
initial,
options=options,
method='L-BFGS-B',
+ jac=True,
)
return {
'base': 0,
@@ -99,13 +112,13 @@
"""Correction term for the 0:0, 0:1, 1:0, 1:1 probabilities."""
# rho_min = max(-1/lambda1, -1/lambda2)
# rho_max = min(1/(lambda1 * lambda2), 1)
- rho_min = -1 / np.maximum(lambda1.max(), lambda2.max())
- rho_max = 1 / np.maximum((lambda1 * lambda2).max(), 1.0)
- rho = np.clip(rho, rho_min + 1e-3, rho_max - 1e-3)
-
- x_corr = np.where(goals1 == 0, -lambda1, 1.0)
- y_corr = np.where(goals2 == 0, lambda2, -1.0)
- r_corr = np.where((goals1 < 2) & (goals2 < 2), rho, 0.0)
+ rho_min = -1 / jnp.maximum(lambda1.max(), lambda2.max())
+ rho_max = 1 / jnp.maximum((lambda1 * lambda2).max(), 1.0)
+ rho = jnp.clip(rho, rho_min + 1e-3, rho_max - 1e-3)
+
+ x_corr = jnp.where(goals1 == 0, -lambda1, 1.0)
+ y_corr = jnp.where(goals2 == 0, lambda2, -1.0)
+ r_corr = jnp.where((goals1 < 2) & (goals2 < 2), rho, 0.0)
return 1 + x_corr * y_corr * r_corr
@@ -117,10 +130,10 @@
weights=1,
):
"""Log likelihood of the Dixon-Coles model."""
- lambda1 = np.exp(attack1 + defend2 + home)
- lambda2 = np.exp(attack2 + defend1)
+ lambda1 = jnp.exp(attack1 + defend2 + home)
+ lambda2 = jnp.exp(attack2 + defend1)
return weights * (
- np.log(rho_correction(goals1, goals2, lambda1, lambda2, rho)) +
+ jnp.log(rho_correction(goals1, goals2, lambda1, lambda2, rho)) +
poisson.logpmf(goals1, lambda1) +
poisson.logpmf(goals2, lambda2)
)
Hey dashee87,
I was wondering if you have the code to generate the images and/or the gifs on your github for the clustering blog post (which by the way is excellent).
thanks for the code, but it does not work:
Show us a prediction for next day that is not in the test set, totally new, can you do that? I think nooo
In block In[13]: ax1.plot(market_info[market_info['Date']>= split_date]['Date'].astype(datetime.datetime), market_info[(market_info['Date']+ datetime.timedelta(days=1))>= split_date]['bt_Close'].values[1:] * (1+bt_random_steps), label='Predicted')
Why use values[1:] here? Shouldn't it be values[:-1]?
If values[1:] is used, there is no point in fetching the data from one day before split_date via (market_info['Date']+ datetime.timedelta(days=1))>= split_date, because [1:] skips that first day; the random walk is then applied to actual_data(t) rather than actual_data(t-1).
Dear Sir,
I very much appreciate your efforts.
When I run "2017-06-04-predicting-football-results-with-statistical-modelling", I get the errors below. I am using the Anaconda distribution, and I get the same error even when I run it from the command prompt. I am using 64-bit Python 3.6.4 on a Mac. Could you please help me? I am very new to Python.
Best Regards,
Zaw
TypeError Traceback (most recent call last)
in ()
12 sun_away_pois = [poisson.pmf(i,np.sum(np.multiply(sun_home.values.T,sun_home.index.T),axis=1)[0]) for i in range(8)]
13
---> 14 ax1.bar(chel_home.index-0.4,chel_home.values,width=0.4,color="#034694",label="Chelsea")
15 ax1.bar(sun_home.index,sun_home.values,width=0.4,color="#EB172B",label="Sunderland")
16 pois1, = ax1.plot([i for i in range(8)], chel_home_pois,
~/anaconda3/lib/python3.6/site-packages/matplotlib/init.py in inner(ax, *args, **kwargs)
1708 warnings.warn(msg % (label_namer, func.name),
1709 RuntimeWarning, stacklevel=2)
-> 1710 return func(ax, *args, **kwargs)
1711 pre_doc = inner.doc
1712 if pre_doc is None:
~/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in bar(self, *args, **kwargs)
2146 edgecolor=e,
2147 linewidth=lw,
-> 2148 label='nolegend',
2149 )
2150 r.update(kwargs)
~/anaconda3/lib/python3.6/site-packages/matplotlib/patches.py in init(self, xy, width, height, angle, **kwargs)
687 """
688
--> 689 Patch.init(self, **kwargs)
690
691 self._x = xy[0]
~/anaconda3/lib/python3.6/site-packages/matplotlib/patches.py in init(self, edgecolor, facecolor, color, linewidth, linestyle, antialiased, hatch, fill, capstyle, joinstyle, **kwargs)
131 self.set_fill(fill)
132 self.set_linestyle(linestyle)
--> 133 self.set_linewidth(linewidth)
134 self.set_antialiased(antialiased)
135 self.set_hatch(hatch)
~/anaconda3/lib/python3.6/site-packages/matplotlib/patches.py in set_linewidth(self, w)
379 w = mpl.rcParams['axes.linewidth']
380
--> 381 self._linewidth = float(w)
382 # scale the dash pattern by the linewidth
383 offset, ls = self._us_dashes
TypeError: only length-1 arrays can be converted to Python scalars
Thanks for the code and the explanations. Is it possible to incorporate the Dixon-Coles correction into the basic Poisson model? Could you do that and upload the code, or show me how to go about it?
Hi, thanks for the software, but
I'm getting an error that I cannot solve, and I don't understand why I'm getting it.
I have already reinstalled pandas and matplotlib, but I still get the same error.
What am I missing? Do you have something on your blog, such as a list of things to install before starting? Actually, I can't find any references to your blog here, only to other blogs.
Anyway, this is my error; can you help me solve it?
When I run In[7] I get this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/.local/lib/python3.5/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3077 try:
-> 3078 return self._engine.get_loc(key)
3079 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'bt_Open'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-7-1d7320c9a250> in <module>()
8 ax2.set_xticks([datetime.date(i,j,1) for i in range(2013,2019) for j in [1,7]])
9 ax2.set_xticklabels([datetime.date(i,j,1).strftime('%b %Y') for i in range(2013,2019) for j in [1,7]])
---> 10 ax1.plot(bitcoin_market_info['Date'].astype(datetime.datetime),bitcoin_market_info['bt_Open'])
11 ax2.bar(bitcoin_market_info['Date'].astype(datetime.datetime).values, bitcoin_market_info['bt_Volume'].values)
12 fig.tight_layout()
~/.local/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
2686 return self._getitem_multilevel(key)
2687 else:
-> 2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
~/.local/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
2693 # get column
2694 if self.columns.is_unique:
-> 2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
~/.local/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
2487 res = cache.get(item)
2488 if res is None:
-> 2489 values = self._data.get(item)
2490 res = self._box_item_values(item, values)
2491 cache[item] = res
~/.local/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
4113
4114 if not isna(item):
-> 4115 loc = self.items.get_loc(item)
4116 else:
4117 indexer = np.arange(len(self.items))[isna(self.items)]
~/.local/lib/python3.5/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3078 return self._engine.get_loc(key)
3079 except KeyError:
-> 3080 return self._engine.get_loc(self._maybe_cast_indexer(key))
3081
3082 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'bt_Open'
Hey there, is there a requirements.txt somewhere to install all the needed packages?
I tried playing with the notebook from GitHub. Why am I getting the following error when I run the following cell?
OSError Traceback (most recent call last)
in ()
7 bt_preds = []
8 for rand_seed in range(775,800):
----> 9 temp_model = load_model('eth_model_randseed_%d.h5'%rand_seed)
10 eth_preds.append(np.mean(abs(np.transpose(temp_model.predict(LSTM_test_inputs))-
11 (test_set['eth_Close'].values[window_len:]/test_set['eth_Close'].values[:-window_len]-1))))
~/anaconda3/lib/python3.6/site-packages/keras/models.py in load_model(filepath, custom_objects, compile)
231 return custom_objects[obj]
232 return obj
--> 233 with h5py.File(filepath, mode='r') as f:
234 # instantiate model
235 model_config = f.attrs.get('model_config')
~/anaconda3/lib/python3.6/site-packages/h5py/_hl/files.py in init(self, name, mode, driver, libver, userblock_size, swmr, **kwds)
267 with phil:
268 fapl = make_fapl(driver, libver, **kwds)
--> 269 fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)
270
271 if swmr_support:
~/anaconda3/lib/python3.6/site-packages/h5py/_hl/files.py in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
97 if swmr and swmr_support:
98 flags |= h5f.ACC_SWMR_READ
---> 99 fid = h5f.open(name, flags, fapl=fapl)
100 elif mode == 'r+':
101 fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)
h5py/_objects.pyx in h5py._objects.with_phil.wrapper()
h5py/_objects.pyx in h5py._objects.with_phil.wrapper()
h5py/h5f.pyx in h5py.h5f.open()
OSError: Unable to open file (unable to open file: name = 'eth_model_randseed_775.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.